import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from scikeras.wrappers import KerasClassifier
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV,cross_val_predict,StratifiedKFold
from sklearn.metrics import classification_report,precision_recall_curve,confusion_matrix,accuracy_score,log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.pipeline import Pipeline, make_pipeline
from matplotlib.colors import ListedColormap
from collections import Counter
Link to kaggle competition https://www.kaggle.com/competitions/playground-series-s3e26/overview \ Link to kaggle page related to original dataset: https://www.kaggle.com/datasets/joebeachcapital/cirrhosis-patient-survival-prediction
Read in data:
train_file = 'train.csv'
train = pd.read_csv(train_file, keep_default_na=False, na_values= 'null')
train.head()
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | D |
| 1 | 1 | 2574 | Placebo | 19237 | F | N | N | N | N | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C |
| 2 | 2 | 3428 | Placebo | 13727 | F | N | Y | Y | Y | 3.3 | 299.0 | 3.55 | 131.0 | 1029.0 | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | D |
| 3 | 3 | 2576 | Placebo | 18460 | F | N | N | N | N | 0.6 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 4 | 4 | 788 | Placebo | 16658 | F | N | Y | N | N | 1.1 | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C |
test_file = 'test.csv'
test = pd.read_csv(test_file, keep_default_na=False, na_values= 'null')
#test.head()
View size of datasets
print(train.shape)
print(test.shape) # test has target "status" column removed
(7905, 20) (5271, 19)
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7905 entries, 0 to 7904 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7905 non-null int64 1 N_Days 7905 non-null int64 2 Drug 7905 non-null object 3 Age 7905 non-null int64 4 Sex 7905 non-null object 5 Ascites 7905 non-null object 6 Hepatomegaly 7905 non-null object 7 Spiders 7905 non-null object 8 Edema 7905 non-null object 9 Bilirubin 7905 non-null float64 10 Cholesterol 7905 non-null float64 11 Albumin 7905 non-null float64 12 Copper 7905 non-null float64 13 Alk_Phos 7905 non-null float64 14 SGOT 7905 non-null float64 15 Tryglicerides 7905 non-null float64 16 Platelets 7905 non-null float64 17 Prothrombin 7905 non-null float64 18 Stage 7905 non-null float64 19 Status 7905 non-null object dtypes: float64(10), int64(3), object(7) memory usage: 1.2+ MB
Feature Descriptions
train.nunique()
id 7905 N_Days 461 Drug 2 Age 391 Sex 2 Ascites 2 Hepatomegaly 2 Spiders 2 Edema 3 Bilirubin 111 Cholesterol 226 Albumin 160 Copper 171 Alk_Phos 364 SGOT 206 Tryglicerides 154 Platelets 227 Prothrombin 49 Stage 4 Status 3 dtype: int64
#call summary stats on training data
train.describe()
# mean age in trial is 18373 days ≈ 50.3 yrs
# mean number of days in trial is 2030 ≈ 5.6 years
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3952.000000 | 2030.173308 | 18373.146490 | 2.594485 | 350.561923 | 3.548323 | 83.902846 | 1816.745250 | 114.604602 | 115.340164 | 265.228969 | 10.629462 | 3.032511 |
| std | 2282.121272 | 1094.233744 | 3679.958739 | 3.812960 | 195.379344 | 0.346171 | 75.899266 | 1903.750657 | 48.790945 | 52.530402 | 87.465579 | 0.781735 | 0.866511 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 1976.000000 | 1230.000000 | 15574.000000 | 0.700000 | 248.000000 | 3.350000 | 39.000000 | 834.000000 | 75.950000 | 84.000000 | 211.000000 | 10.000000 | 2.000000 |
| 50% | 3952.000000 | 1831.000000 | 18713.000000 | 1.100000 | 298.000000 | 3.580000 | 63.000000 | 1181.000000 | 108.500000 | 104.000000 | 265.000000 | 10.600000 | 3.000000 |
| 75% | 5928.000000 | 2689.000000 | 20684.000000 | 3.000000 | 390.000000 | 3.770000 | 102.000000 | 1857.000000 | 137.950000 | 139.000000 | 316.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
train.hist(bins=100, figsize=(20,15)) # producing histograms features in our dataset (note: id is a unique identifier, we are not interested in this)
plt.show()
Many of the features seem to contain a large number of outliers. Addtionally, many seem to have heavy tailed distributions. This is something we will have to deal with in data cleaning.
We will take a closer look at the distribution of our numeric features below.
#N_Days has 461 unique values
train['N_Days'].hist(edgecolor="black", bins= 10)
plt.title("Number of days")
plt.show()
#Age has 391 unique values
train['Age'].hist(edgecolor="black")
plt.title("Age of Patients (days)")
plt.show()
The Age is somewhat normally distributed, with its peak lying at 20,000 days (approximately 55 years of age)
#bili has 111 unique values
train['Bilirubin'].hist(edgecolor="black")
plt.title("Distribution of Bilirubin levels")
plt.show()
Bilirubin levels has a very heavy tailed distribution, with some observations occuring in very few cases
#cholest has 226 unique values
train['Cholesterol'].hist(edgecolor="black")
plt.title("Distribution of Cholesterol levels")
plt.show()
Again Cholesterol has a very heavy tailed distribution
#albumin has 160 unique values
train['Albumin'].hist(edgecolor="black")
plt.title("Distribution of Albumin levels")
plt.show()
#copper has 171 unique values
train['Copper'].hist(edgecolor="black")
plt.title("Distribution of Copper levels (urine)")
plt.show()
Copper level distribution is heavy tailed
#alk phos has 364 unique values
train['Alk_Phos'].hist(edgecolor="black")
plt.title("Distribution of Alkaline Phosphatase levels")
plt.show()
Alkaline-Phosphate distribution is heavy tailed
#sgot has 206 unique values
train['SGOT'].hist(edgecolor="black")
plt.title("Distribution of SGOT levels")
plt.show()
SGOT appears to have both a heavy tailed distribution and outlier values.
#TG has 154 unique values
train['Tryglicerides'].hist(edgecolor="black")
plt.title("Distribution of Triglyceride levels")
plt.show()
Triglyceride levels, have a heavy tailed distribution and outlier values.
#plts has 227 unique values
train['Platelets'].hist(edgecolor="black")
plt.title("Distribution of Platelet counts")
plt.show()
#prothrombin has 49 unique values
train['Prothrombin'].hist(edgecolor="black")
plt.title("Distribution of Prothrombin times")
plt.show()
train['Prothrombin'].max() # clear outlier value
18.0
Prothrombin times has some extreme outliers, without these the distribution is fair.
Visualize categorical data
train['Drug'].hist(bins=2, edgecolor="black")
plt.title("Distribution of Drugs")
plt.show()
train['Sex'].hist(bins=2, edgecolor="black")
plt.title("Distribution of Sex of Patients")
plt.show()
# very unbalanced, unclear why this disbalance would occur.
# could investigate if there is a relevant difference in data between two classes
This was the most unexpected distribution of the categorical variables, females represent a significantly larger group than men in this dataset.
This data implies (assuming it is representative of the population) that women are significantly more likely to suffer from liver disease and/or more likely to become involved with a study.
The CDC's article 'Excessive Alcohol Use is a Risk to Women’s Health' states " ... differences make women more susceptible to the long-term negative health effects of alcohol compared with men". As such this data might be reasonable.
train['Ascites'].hist(bins=2, edgecolor="black")
plt.title("Distribution of Ascites in Patients")
plt.show()
train['Hepatomegaly'].hist(bins=2, edgecolor="black")
plt.title("Distribution of Hepatomegaly in Patients")
plt.show()
train['Spiders'].hist(bins=2, edgecolor="black")
plt.title("Distribution of Spider Veins in Patients")
plt.show()
train['Edema'].hist(bins=3, edgecolor="black")
plt.title("Presence of Edema in Patients")
Text(0.5, 1.0, 'Presence of Edema in Patients')
Most patients do not present edema.
train['Stage'].hist(bins=4, edgecolor="black")
plt.title("Distribution of Liver Disease Stage")
plt.show()
The greatest proportion of patients are stage 3 and 4.
train['Status'].hist(bins=3, edgecolor="black")
<Axes: >
Clear class imbalance here.
sns.pairplot(train, hue="Stage", corner=True, palette='colorblind')
plt.show()
From the pairwise scatterplot, no linear relationships are immediately apparent, mostly just clusters with some outliers.
#creating a table of correlations between numeric variables
correlations = train.corr(numeric_only=True)
correlations
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | -0.011638 | -0.008351 | 0.007194 | -0.011046 | -0.019808 | 0.008203 | -0.004393 | 0.020395 | -0.006650 | -0.007707 | 0.007979 | -0.011391 |
| N_Days | -0.011638 | 1.000000 | -0.102354 | -0.346434 | -0.145811 | 0.255724 | -0.284355 | -0.030874 | -0.240918 | -0.186453 | 0.147626 | -0.156032 | -0.216820 |
| Age | -0.008351 | -0.102354 | 1.000000 | 0.099016 | -0.053876 | -0.114848 | 0.095199 | 0.025879 | -0.020768 | 0.021767 | -0.094822 | 0.141705 | 0.118294 |
| Bilirubin | 0.007194 | -0.346434 | 0.099016 | 1.000000 | 0.302153 | -0.303191 | 0.442223 | 0.131317 | 0.368653 | 0.315681 | -0.081987 | 0.294325 | 0.200134 |
| Cholesterol | -0.011046 | -0.145811 | -0.053876 | 0.302153 | 1.000000 | -0.091830 | 0.168266 | 0.129131 | 0.326864 | 0.274044 | 0.091455 | 0.023761 | 0.037372 |
| Albumin | -0.019808 | 0.255724 | -0.114848 | -0.303191 | -0.091830 | 1.000000 | -0.218479 | -0.083582 | -0.200928 | -0.112304 | 0.141284 | -0.204600 | -0.233245 |
| Copper | 0.008203 | -0.284355 | 0.095199 | 0.442223 | 0.168266 | -0.218479 | 1.000000 | 0.124058 | 0.323226 | 0.290435 | -0.107894 | 0.238771 | 0.182007 |
| Alk_Phos | -0.004393 | -0.030874 | 0.025879 | 0.131317 | 0.129131 | -0.083582 | 0.124058 | 1.000000 | 0.128746 | 0.087789 | 0.047869 | 0.079517 | 0.061326 |
| SGOT | 0.020395 | -0.240918 | -0.020768 | 0.368653 | 0.326864 | -0.200928 | 0.323226 | 0.128746 | 1.000000 | 0.155287 | -0.042004 | 0.136766 | 0.118419 |
| Tryglicerides | -0.006650 | -0.186453 | 0.021767 | 0.315681 | 0.274044 | -0.112304 | 0.290435 | 0.087789 | 0.155287 | 1.000000 | 0.006511 | 0.063582 | 0.073614 |
| Platelets | -0.007707 | 0.147626 | -0.094822 | -0.081987 | 0.091455 | 0.141284 | -0.107894 | 0.047869 | -0.042004 | 0.006511 | 1.000000 | -0.169741 | -0.175960 |
| Prothrombin | 0.007979 | -0.156032 | 0.141705 | 0.294325 | 0.023761 | -0.204600 | 0.238771 | 0.079517 | 0.136766 | 0.063582 | -0.169741 | 1.000000 | 0.254674 |
| Stage | -0.011391 | -0.216820 | 0.118294 | 0.200134 | 0.037372 | -0.233245 | 0.182007 | 0.061326 | 0.118419 | 0.073614 | -0.175960 | 0.254674 | 1.000000 |
# Plot miles per stage versus N_Days
sns.relplot(x="N_Days", y="SGOT", hue="Stage",
sizes=(40, 400), alpha=.5, palette="colorblind",
height=6, data=train).set(title="N_days vs SGOT and Stage")
plt.show()
Not especially useful.
Except a grouping of stage 4 patients at the start (0), of N_days. So more patients at the beginning were stage 4? A lower proportion of stage 4 patients spent a long time in study?
# Plot miles per stage versus N_Days
sns.relplot(x="Age", y="SGOT", hue="Stage",
sizes=(40, 400), alpha=.5, palette="colorblind",
height=6, data=train).set(title="Age vs SGOT and Stage")
plt.show()
Towards the far end of the Age (>25000 or 68.5 years), most of the patients are stage 4, the left side of age has more green, so younger patients are stage 3. Notably very few blue or stage 1 dots in the plot.
# Draw a nested barplot by stage and sex
g = sns.catplot(
data=train, kind="bar",
x="Status", y="Bilirubin", hue="Sex",
errorbar="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels()
g.legend.set_title("Sex")
g.set(title="Bilirubin and Status by Sex")
<seaborn.axisgrid.FacetGrid at 0x19be91d9430>
Males in the study appear to be more likely to survive. i.e higher proportion of men with C or CL status.
This could be a sampling related phenomenom or related to attitudes in medicine and healthcare at the time this study was conducted (1974 to 1984)
Boxplots to visualize Target variable with continuous variables (namely lab tests):
sns.catplot(data=train, x="Status", y="Bilirubin",
hue="Stage", kind="box").set(title='Bilirubin by Status and Stage')
plt.show()
The range of values for Bilirubin is all much more spread out in patients who died. Also, across all three outcomes for Status, the patients who had stage 4 liver disease had higher bilirubin values on average than earlier stages.
sns.catplot(data=train, x="Status", y="Cholesterol",
hue="Stage", kind="box").set(title='Cholesterol by Status and Stage')
plt.show()
sns.catplot(data=train, x="Status", y="Albumin",
hue="Stage", kind="box").set(title='Albumin by Status and Stage')
plt.show()
sns.catplot(data=train, x="Status", y="Copper",
hue="Stage", kind="box").set(title='Copper by Status and Stage')
plt.show()
Notably, copper values were much higher across all patients who died, regardless of their stage, when compared to other outcomes.
sns.catplot(data=train, x="Status", y="Alk_Phos",
hue="Stage", kind="box").set(title='Alk_Phos by Status and Stage')
plt.show()
These values have lots of outliers.
sns.catplot(data=train, x="Status", y="SGOT",
hue="Stage", kind="box").set(title='SGOT by Status and Stage')
plt.show()
Across all three outcomes, patients with stage 4 liver disease had higher prothrombin values in each outcome category.
sns.catplot(data=train, x="Status", y="Tryglicerides",
hue="Stage", kind="box").set(title='Triglycerides by Status and Stage')
plt.show()
sns.catplot(data=train, x="Status", y="Platelets",
hue="Stage", kind="box").set(title='Platelets by Status and Stage')
plt.show()
sns.catplot(data=train, x="Status", y="Prothrombin",
hue="Stage", kind="box").set(title='Prothrombin by Status and Stage')
plt.show()
Stage v Status
cross_tab = pd.crosstab(train['Stage'], train['Status'])
print(cross_tab)
ratio = (cross_tab.iloc[:, 0] + cross_tab.iloc[:, 1])/ cross_tab.iloc[:, 2]
# ratio of surviving patients to patients who die
ratio
Status C CL D Stage 1.0 351 7 39 2.0 1293 47 312 3.0 2298 113 742 4.0 1023 108 1572
Stage 1.0 9.179487 2.0 4.294872 3.0 3.249326 4.0 0.719466 dtype: float64
Suggests a clear association between response variable and stage of disease. More advanced the disease the more likely you are to die.
Sex v Status
cross_tab = pd.crosstab(train['Sex'], train['Status'])
print(cross_tab)
ratio = (cross_tab.iloc[:, 0] + cross_tab.iloc[:, 1])/ cross_tab.iloc[:, 2]
ratio
Status C CL D Sex F 4735 251 2350 M 230 24 315
Sex F 2.121702 M 0.806349 dtype: float64
Males with liver disease seem to have a higher proportion of deaths.
Drug v Status
cross_tab = pd.crosstab(train['Drug'], train['Status'])
print(cross_tab)
ratio = (cross_tab.iloc[:, 0] + cross_tab.iloc[:, 1])/ cross_tab.iloc[:, 2]
# ratio of surviving patients to patients who die
ratio
Status C CL D Drug D-penicillamine 2405 151 1339 Placebo 2560 124 1326
Drug D-penicillamine 1.908887 Placebo 2.024133 dtype: float64
Very weak, potentially non-existent association between Drug and outcome.
Note: I have simplified the response variable in 'ratio', Drug could potentially still be valuable?
Edema v Status
cross_tab = pd.crosstab(train['Edema'], train['Status'])
print(cross_tab)
ratio = (cross_tab.iloc[:, 0] + cross_tab.iloc[:, 1])/ cross_tab.iloc[:, 2]
# ratio of surviving patients to patients who die
ratio
Status C CL D Edema N 4847 257 2057 S 110 16 273 Y 8 2 335
Edema N 2.481283 S 0.461538 Y 0.029851 dtype: float64
Sex v Drug
cross_tab = pd.crosstab(train['Sex'], train['Drug'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio #suggest males where more likely to be given the drug than females
# although could the sample of men be unrepresentative
Drug D-penicillamine Placebo Sex F 3569 3767 M 326 243
Sex F 0.947438 M 1.341564 dtype: float64
Stage v Drug
cross_tab = pd.crosstab(train['Stage'], train['Drug'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio # suggests there isnt a huge association between stage and drug
# although stage 4 seems to have mor epeople on placebo (association here maybe)
Drug D-penicillamine Placebo Stage 1.0 204 193 2.0 853 799 3.0 1559 1594 4.0 1279 1424
Stage 1.0 1.056995 2.0 1.067584 3.0 0.978043 4.0 0.898174 dtype: float64
Sex v Stage
cross_tab = pd.crosstab(train['Sex'], train['Stage'])
sns.heatmap(cross_tab, annot=True, cmap="YlGnBu")
<Axes: xlabel='Stage', ylabel='Sex'>
While the distribution earlier showed most in trial were in stage 3 damage. This shows the biggest group for males is stage 4 as opposed to stage 3 for females.
Stage v Drug
cross_tab = pd.crosstab(train['Stage'], train['Drug'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio # suggests there isnt a huge association between stage and drug
# although stage 4 seems to have more people on placebo (association here maybe)
Drug D-penicillamine Placebo Stage 1.0 204 193 2.0 853 799 3.0 1559 1594 4.0 1279 1424
Stage 1.0 1.056995 2.0 1.067584 3.0 0.978043 4.0 0.898174 dtype: float64
Stage v Ascites
cross_tab = pd.crosstab(train['Stage'], train['Ascites'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio
Ascites N Y Stage 1.0 396 1 2.0 1632 20 3.0 3080 73 4.0 2417 286
Stage 1.0 396.000000 2.0 81.600000 3.0 42.191781 4.0 8.451049 dtype: float64
clear trend of ascites occuring in more developed instances of liver damages
Stage v Hepatomegaly
cross_tab = pd.crosstab(train['Stage'], train['Hepatomegaly'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio
Hepatomegaly N Y Stage 1.0 358 39 2.0 1240 412 3.0 1887 1266 4.0 378 2325
Stage 1.0 9.179487 2.0 3.009709 3.0 1.490521 4.0 0.162581 dtype: float64
This suggests hepatomegaly is found less in more advanced cases of liver disease
online search suggests this makes sense:
"In the early stages of liver disease, hepatomegaly (enlargement of the liver) might be present due to inflammation, fatty infiltration, or other factors. However, as the disease progresses to advanced stages, the liver may become nodular and shrunken, a condition known as "cirrhotic liver" or "cirrhotic hepatomegaly.""
Stage v Spiders
cross_tab = pd.crosstab(train['Stage'], train['Spiders'])
print(cross_tab)
ratio = cross_tab.iloc[:, 0] / cross_tab.iloc[:, 1]
ratio
Spiders N Y Stage 1.0 370 27 2.0 1454 198 3.0 2587 566 4.0 1555 1148
Stage 1.0 13.703704 2.0 7.343434 3.0 4.570671 4.0 1.354530 dtype: float64
Stage v Edema
cross_tab = pd.crosstab(train['Stage'], train['Edema'])
print(cross_tab)
sns.heatmap(cross_tab, annot=True, cmap="YlGnBu")
Edema N S Y Stage 1.0 391 5 1 2.0 1596 41 15 3.0 2976 124 53 4.0 2198 229 276
<Axes: xlabel='Edema', ylabel='Stage'>
Edema becomes more likely the more advanced the liver disease
OD_train_path = 'train.csv'
OD_train = pd.read_csv(OD_train_path)
OD_train
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | D |
| 1 | 1 | 2574 | Placebo | 19237 | F | N | N | N | N | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C |
| 2 | 2 | 3428 | Placebo | 13727 | F | N | Y | Y | Y | 3.3 | 299.0 | 3.55 | 131.0 | 1029.0 | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | D |
| 3 | 3 | 2576 | Placebo | 18460 | F | N | N | N | N | 0.6 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 4 | 4 | 788 | Placebo | 16658 | F | N | Y | N | N | 1.1 | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7900 | 7900 | 1166 | D-penicillamine | 16839 | F | N | N | N | N | 0.8 | 309.0 | 3.56 | 38.0 | 1629.0 | 79.05 | 224.0 | 344.0 | 9.9 | 2.0 | C |
| 7901 | 7901 | 1492 | Placebo | 17031 | F | N | Y | N | N | 0.9 | 260.0 | 3.43 | 62.0 | 1440.0 | 142.00 | 78.0 | 277.0 | 10.0 | 4.0 | C |
| 7902 | 7902 | 1576 | D-penicillamine | 25873 | F | N | N | Y | S | 2.0 | 225.0 | 3.19 | 51.0 | 933.0 | 69.75 | 62.0 | 200.0 | 12.7 | 2.0 | D |
| 7903 | 7903 | 3584 | D-penicillamine | 22960 | M | N | Y | N | N | 0.7 | 248.0 | 2.75 | 32.0 | 1003.0 | 57.35 | 118.0 | 221.0 | 10.6 | 4.0 | D |
| 7904 | 7904 | 1978 | D-penicillamine | 19237 | F | N | N | N | N | 0.7 | 256.0 | 3.23 | 22.0 | 645.0 | 74.40 | 85.0 | 336.0 | 10.3 | 3.0 | C |
7905 rows × 20 columns
OD_train.hist(bins=100, figsize=(20,15))
plt.show()
Based on the distributions we might have some issue using this data in models which are sensitive to outliers/heavy tails.
Features with heavy tails/ lots of outliers include Prothrombin, Tryglicerides, SGOT, Alk_Phos, Copper, Cholesterol and Bilirubin.
Since there are so many of these 'outliers' it wouldn't make sense to delete rows. Instead we will take the log of these values to reduce the impact of extrema.
high_outlier_data = ['Prothrombin', 'Tryglicerides', 'SGOT', 'Alk_Phos', 'Copper', 'Cholesterol', 'Bilirubin']
def log_adjust_outliers(df, outlier_cols): # we can use this function in pre-procesing
for col in outlier_cols:
df[col] = np.log(df[col])
return df
#apply this function to a copy of OD_train
train_copy = OD_train.copy()
log_adjust_copy = log_adjust_outliers(train_copy, high_outlier_data)
log_adjust_copy.hist(bins=100, figsize=(20,15))
array([[<Axes: title={'center': 'id'}>,
<Axes: title={'center': 'N_Days'}>,
<Axes: title={'center': 'Age'}>,
<Axes: title={'center': 'Bilirubin'}>],
[<Axes: title={'center': 'Cholesterol'}>,
<Axes: title={'center': 'Albumin'}>,
<Axes: title={'center': 'Copper'}>,
<Axes: title={'center': 'Alk_Phos'}>],
[<Axes: title={'center': 'SGOT'}>,
<Axes: title={'center': 'Tryglicerides'}>,
<Axes: title={'center': 'Platelets'}>,
<Axes: title={'center': 'Prothrombin'}>],
[<Axes: title={'center': 'Stage'}>, <Axes: >, <Axes: >, <Axes: >]],
dtype=object)
This has greatly improved the distribution of feature values.
However, we can see Prothrombin still has some extreme outliers.
We can remove the rows where log Prothrombin is greater than 2.65.
indices_to_drop = train_copy.index[train_copy['Prothrombin'] > 2.65]
len(indices_to_drop) # only 15 rows (this is an acceptable loss, 1.9% of data)
train_copy.drop(indices_to_drop, inplace=True)
train_copy.hist(bins=100, figsize=(20,15))
array([[<Axes: title={'center': 'id'}>,
<Axes: title={'center': 'N_Days'}>,
<Axes: title={'center': 'Age'}>,
<Axes: title={'center': 'Bilirubin'}>],
[<Axes: title={'center': 'Cholesterol'}>,
<Axes: title={'center': 'Albumin'}>,
<Axes: title={'center': 'Copper'}>,
<Axes: title={'center': 'Alk_Phos'}>],
[<Axes: title={'center': 'SGOT'}>,
<Axes: title={'center': 'Tryglicerides'}>,
<Axes: title={'center': 'Platelets'}>,
<Axes: title={'center': 'Prothrombin'}>],
[<Axes: title={'center': 'Stage'}>, <Axes: >, <Axes: >, <Axes: >]],
dtype=object)
These distributions are looking better. We expect these to perform better in our models.
OD_train_copy = OD_train.copy()
OD_train_copy['Age_in_Years'] = OD_train_copy['Age'] / 365 # creating a column which is age in years
bin_edges = [20, 35, 45, 60, 80]
bin_labels = ['20-35', '35-45', '45-60', '60-80']
OD_train_copy['Age_Category'] = pd.cut(OD_train_copy['Age_in_Years'], bins=bin_edges, labels=bin_labels, right=False)
# converting age in years to age categories
OD_train_copy.head()
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | ... | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | Age_in_Years | Age_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | ... | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | D | 58.991781 | 45-60 |
| 1 | 1 | 2574 | Placebo | 19237 | F | N | N | N | N | 0.9 | ... | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C | 52.704110 | 45-60 |
| 2 | 2 | 3428 | Placebo | 13727 | F | N | Y | Y | Y | 3.3 | ... | 131.0 | 1029.0 | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | D | 37.608219 | 35-45 |
| 3 | 3 | 2576 | Placebo | 18460 | F | N | N | N | N | 0.6 | ... | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C | 50.575342 | 45-60 |
| 4 | 4 | 788 | Placebo | 16658 | F | N | Y | N | N | 1.1 | ... | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C | 45.638356 | 45-60 |
5 rows × 22 columns
We have decided to bin these features as we hope it will reduce noise within our data and have a clearer relationship with our response variable.
OD_train_copy['Years_in_trial'] = OD_train_copy['N_Days'] / 365 # time in trial in years
bin_edges = [0, 2, 5, 10, 15]
bin_labels = ['0-2', '2-5', '5-10', '10-15']
OD_train_copy['Years_Category'] = pd.cut(OD_train_copy['Years_in_trial'], bins=bin_edges, labels=bin_labels, right=False)
# create categories for time in trial
OD_train_copy.head()
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | ... | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | Age_in_Years | Age_Category | Years_in_trial | Years_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | ... | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | D | 58.991781 | 45-60 | 2.736986 | 2-5 |
| 1 | 1 | 2574 | Placebo | 19237 | F | N | N | N | N | 0.9 | ... | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C | 52.704110 | 45-60 | 7.052055 | 5-10 |
| 2 | 2 | 3428 | Placebo | 13727 | F | N | Y | Y | Y | 3.3 | ... | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | D | 37.608219 | 35-45 | 9.391781 | 5-10 |
| 3 | 3 | 2576 | Placebo | 18460 | F | N | N | N | N | 0.6 | ... | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C | 50.575342 | 45-60 | 7.057534 | 5-10 |
| 4 | 4 | 788 | Placebo | 16658 | F | N | Y | N | N | 1.1 | ... | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C | 45.638356 | 45-60 | 2.158904 | 2-5 |
5 rows × 24 columns
OD_train_copy['Age_Category'].unique()
['45-60', '35-45', '60-80', '20-35'] Categories (4, object): ['20-35' < '35-45' < '45-60' < '60-80']
OD_train_copy['Years_Category'].unique()
['2-5', '5-10', '0-2', '10-15'] Categories (4, object): ['0-2' < '2-5' < '5-10' < '10-15']
OD_train_copy = OD_train_copy.drop(['N_Days', 'Years_in_trial', 'Age', 'Age_in_Years'], axis = 1) # removing the columns we don't want
OD_train_encoded = pd.get_dummies(OD_train_copy, columns=['Years_Category', 'Age_Category', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Drug', 'Stage', 'Status'])
plt.figure(figsize=(10, 8))
sns.heatmap(OD_train_encoded.corr(), annot=False, fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()
sns.catplot(x='Age_Category', hue='Years_Category', col='Status', data=OD_train_copy, kind='count', palette='magma', height=5, aspect=1)
plt.show()
Evidence that this binning strategy maintains relationships between both Age and time in trial with Status.
We will use these new features
Do resampling (addressing unbalanced classes)
drop 'id' (unique identifier)
split DF into X (input) and Y (response)
take log of features with heavy tails
drop outliers from Prothrombin
engineer Age_Category and Years_Category
encode features as appropriate
# Load the dataset
OD_train = pd.read_csv('data/train.csv')
OD_test = pd.read_csv('data/test.csv')
def df_strings_to_numbers(entire_df): # needed before resampling techniques can be applied
entire_df['Drug'] = np.where(entire_df['Drug']=='D-penicillamine', 1, 0)
entire_df['Sex'] = np.where(entire_df['Sex']=='F', 1, 0)
entire_df['Ascites'] = np.where(entire_df['Ascites']=='Y', 1, 0)
entire_df['Hepatomegaly'] = np.where(entire_df['Hepatomegaly']=='Y', 1, 0)
entire_df['Spiders'] = np.where(entire_df['Spiders']=='Y', 1, 0)
entire_df['Edema'] = entire_df['Edema'].replace({'N': 0, 'S': 1, 'Y': 2 })
entire_df['Status'] = entire_df['Status'].replace({'D': 0, 'CL': 1, 'C': 2 })
return entire_df
def split_and_drop_entire_DF(df): # drops id from dataframe and splits df into X and Y
return df.drop(columns=['Status', 'id']), df['Status']
def log_adjust_outliers(df, outlier_cols): # takes log of heavy-tailed features
for col in outlier_cols:
df[col] = np.log(df[col])
return df
def adjust_prothrombin(df): # remove outliers from prothrombin
indices = df.index[df['Prothrombin'] > 2.65]
return df.drop(indices),indices
def Age_Categories(df): # engineers Age_Category a binning of Age
df['Age_in_Years'] = df['Age'] / 365
bin_edges = [20, 35, 45, 60, 80]
bin_labels = ['20-35', '35-45', '45-60', '60-80']
df['Age_Category'] = pd.cut(df['Age_in_Years'], bins=bin_edges, labels=bin_labels, right=False)
df = df.drop(['Age', 'Age_in_Years'], axis = 1)
return df
def Year_Categories(df): # engineers Year_Category a binning of N_Days
df['Years_in_trial'] = df['N_Days'] / 365
bin_edges = [0, 2, 5, 10, 15]
bin_labels = ['0-2', '2-5', '5-10', '10-15']
df['Years_Category'] = pd.cut(df['Years_in_trial'], bins=bin_edges, labels=bin_labels, right=False)
df = df.drop(['N_Days', 'Years_in_trial'], axis = 1)
return df
# combining steps after resampling we have
def processing_step_2(df):
X, Y = split_and_drop_entire_DF(df)
high_outlier_data = ['Prothrombin', 'Tryglicerides', 'SGOT', 'Alk_Phos', 'Copper', 'Cholesterol', 'Bilirubin']
X = log_adjust_outliers(X, high_outlier_data)
X,indices = adjust_prothrombin(X)
Y = Y.drop(indices)
#X = Age_Categories(X)
#X = Year_Categories(X)
return X, Y
X_tmp, y_tmp = processing_step_2(OD_train)
Age_Categories and Year_Categories have been removed from the pre-processing steps as in initial model training it became clear that models trained on the numerical values N_Days and Age outperformed those using the binned categories.
# Pipeline
all_columns = X_tmp.columns
print(all_columns)
nominal_columns = ['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Drug']
#ordinal_columns = ['Stage', 'Years_Category', 'Age_Category'] # inline with our removal of Years_Category and Age_Category as features
ordinal_columns = ['Stage']
numerical_columns = list(set(all_columns)-set(ordinal_columns+nominal_columns))
ordinal_transformer = Pipeline(steps=[
('ordinal', OrdinalEncoder())
])
nominal_transformer = Pipeline(steps=[
('onehot', OneHotEncoder())
])
numerical_transformer = Pipeline(steps=[
('scaler', StandardScaler())
])
label_transformer = Pipeline([
('label transform', LabelEncoder()) #taking status as an ordinal variable - (D, CL, C) as order of 'unwellness'.
])
full_input_preprocessor = ColumnTransformer(transformers=[
('num', numerical_transformer, numerical_columns),
('ord', ordinal_transformer, ordinal_columns),
('nom', nominal_transformer, nominal_columns)
])
full_response_preprocessor = LabelEncoder()
Index(['N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage'],
dtype='object')
# Preprocess response data
X_ready = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp),
columns=full_input_preprocessor.get_feature_names_out())
X = X_ready
# Preprocess response data
y_ready = full_response_preprocessor.fit_transform(y_tmp)
y = y_ready
def preprocess_data(data,resample='Smote',type='train'):
ID = data['id']
if type == 'train':
X_tmp, y_tmp = processing_step_2(data)
X = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp),
columns=full_input_preprocessor.get_feature_names_out())
y = full_response_preprocessor.fit_transform(y_tmp)
else:
X_tmp = data.drop('id',axis=1)
y = []
high_outlier_data = ['Prothrombin', 'Tryglicerides', 'SGOT', 'Alk_Phos', 'Copper', 'Cholesterol', 'Bilirubin']
X_tmp = log_adjust_outliers(X_tmp, high_outlier_data)
#X_tmp,indices = adjust_prothrombin(X_tmp)
X = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp),
columns=full_input_preprocessor.get_feature_names_out())
return X,y,ID
X, y, ID = preprocess_data(OD_train)
Random Forest classifiers can directly classify instances into multiple classes based on probability of each class. We are not concerned with OvR and OvO.
# initialise classifier
rm_classifier = RandomForestClassifier(n_estimators=100, min_samples_split=10,random_state=79,class_weight='balanced')
imba_rm = make_pipeline(SMOTE(random_state=5059), rm_classifier)
y_pred_proba = cross_val_predict(rm_classifier, X, y, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
# Evaluating the model (example using accuracy)
print(f'log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
log loss: 0.5007918801070789 Validation accuracy: 0.8233206590621039 Null Accuracy: C 0.628644 D 0.336502 CL 0.034854 Name: Status, dtype: float64
rm_classifier.fit(X,y)
trees = rm_classifier.estimators_
tree_depths = [tree.tree_.max_depth for tree in trees]
print(f"Mean tree depth: {np.mean(tree_depths)}")
print(f"Min tree depth: {np.min(tree_depths)}")
print(f"Max tree depth: {np.max(tree_depths)}")
Mean tree depth: 22.99 Min tree depth: 19 Max tree depth: 28
# Consider the prediction of each class against the rest, plot each precision-recall curve
def precision_recall_multiclass(classifier,X,y):
# create the validation set
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=79,stratify = y)
# use SMOTE to fit the classifier
X_train_resampled,y_train_resampled = SMOTE(random_state=5059).fit_resample(X_train,y_train)
rm_classifier.fit(X_train_resampled,y_train_resampled)
y_pred_prob_valid = classifier.predict_proba(X_valid)
y_pred_prob_train = classifier.predict_proba(X_train)
# Access the class labels
class_labels_after = classifier.classes_
class_labels_before = full_response_preprocessor.classes_
print(f'Before:{class_labels_before} \n After: {class_labels_after}')
for ii in range(3):
precision_train, recall_train, threshold_train = precision_recall_curve(y_train==ii,y_pred_prob_train[:,ii])
precision_valid, recall_valid, threshold_valid = precision_recall_curve(y_valid==ii,y_pred_prob_valid[:,ii])
plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.plot(recall_valid,precision_valid,'r--',label='valid')
plt.plot(recall_train,precision_train,'b-',label='train')
plt.legend()
plt.grid(True)
plt.title(f'Precision-Recall:{class_labels_before[ii]}')
plt.subplot(1,2,2)
plt.title(f'Precision-Recall-Threshold:{class_labels_before[ii]}')
plt.plot(threshold_valid,recall_valid[:-1],label='recall')
plt.plot(threshold_valid,precision_valid[:-1],label='precision')
plt.legend()
plt.grid(True)
precision_recall_multiclass(rm_classifier,X,y)
Before:['C' 'CL' 'D'] After: [0 1 2]
{0,1,2} = {'C','CL','D'}. Clearly over fitting for CL as it is synthesized data.
# function for plotting confusion matrix
def confusion_matrix_plot(conf_matrix,type):
# Define class labels
class_names = ['C', 'CL','D']
# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2) # Adjust font scale for better visualization
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title(f'Confusion Matrix of {type}')
plt.show()
def normalised_confusion_matrix_plot(conf_matrix, type):
class_names = ['C', 'CL','D']
class_totals = np.sum(conf_matrix, axis=1)
normalised_conf_matrix = np.zeros_like(conf_matrix, dtype=float)
for i in range(len(class_totals)):
if class_totals[i] != 0:
normalised_conf_matrix[i, :] = conf_matrix[i, :] / class_totals[i] * 100
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2) # Adjust font scale for better visualization
sns.heatmap(normalised_conf_matrix, annot=True, fmt='.1f', cmap='Reds',
xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title(f'Normalised Confusion Matrix of {type}')
plt.show()
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Random forest')
normalised_confusion_matrix_plot(conf_matrix,'Random forest')
Hyperparamater choices:
min_samples_split: By increasing the minimum number of samples required to split a node, we enforce a more stringent condition for node splitting, which can prevent the model from creating overly complex and specialized decision rules that capture noise in the training data, thus reducing overfitting.
n_estimators: Including fewer trees in the forest, indicated by lower values like 50, helps mitigate overfitting by reducing the model's capacity to memorize the training data. Conversely, larger values like 200 allow the model to learn more complex patterns but might lead to overfitting if not carefully tuned.
max_depth: Constraining the maximum depth of individual trees, such as with depths of 5 or 10, prevents the model from growing excessively deep trees that can perfectly fit the training data but generalize poorly to unseen data, thus controlling overfitting by limiting the complexity of the trees.
min_samples_leaf: Setting a higher minimum number of samples required to be at a leaf node, like 10 or 20, ensures that each leaf node represents a more generalized subset of the data, discouraging the model from creating overly specific and intricate decision paths that capture noise, thereby reducing overfitting.
# Define the parameter grid to search
param_grid = {
'min_samples_split': [30,50,120], # Minimum number of samples required to split an internal node
'n_estimators': [50, 100, 200], # Number of trees in the forest
'max_depth': [5,10,15], # Maximum depth of the tree
'min_samples_leaf': [10, 20, 30]# Minimum number of samples required to be at a leaf node
}
rm_classifier = RandomForestClassifier(class_weight='balanced',random_state=779,criterion='log_loss')
ba_rm_classifier = make_pipeline(SMOTE(random_state=5059), rm_classifier)
# Initialize GridSearchCV
new_params = {'randomforestclassifier__' + key: param_grid[key] for key in param_grid}
grid_search = GridSearchCV(ba_rm_classifier, new_params, cv=5, scoring='neg_log_loss')
grid_search.fit(X,y)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('smote', SMOTE(random_state=5059)),
('randomforestclassifier',
RandomForestClassifier(class_weight='balanced',
criterion='log_loss',
random_state=779))]),
param_grid={'randomforestclassifier__max_depth': [5, 10, 15],
'randomforestclassifier__min_samples_leaf': [10, 20,
30],
'randomforestclassifier__min_samples_split': [30, 50,
120],
'randomforestclassifier__n_estimators': [50, 100,
200]},
scoring='neg_log_loss')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('smote', SMOTE(random_state=5059)),
('randomforestclassifier',
RandomForestClassifier(class_weight='balanced',
criterion='log_loss',
random_state=779))]),
param_grid={'randomforestclassifier__max_depth': [5, 10, 15],
'randomforestclassifier__min_samples_leaf': [10, 20,
30],
'randomforestclassifier__min_samples_split': [30, 50,
120],
'randomforestclassifier__n_estimators': [50, 100,
200]},
scoring='neg_log_loss')Pipeline(steps=[('smote', SMOTE(random_state=5059)),
('randomforestclassifier',
RandomForestClassifier(class_weight='balanced',
criterion='log_loss',
random_state=779))])SMOTE(random_state=5059)
RandomForestClassifier(class_weight='balanced', criterion='log_loss',
random_state=779)# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
# Best model
rm_classifier_tuned = grid_search.best_estimator_
# Predicting on the test set with the best found parameters
y_pred_best = cross_val_predict(rm_classifier_tuned, X, y, cv=5)
#y_pred_prob_best = cross_val_predict(rm_classifier_tuned, X, y, cv=5,method='predict_proba')
# Evaluating the model with the best found parameters
#print(f'log loss: {log_loss(y, y_pred_prob_best,labels= [0,1,2])}')
print(f'Test accuracy (best parameters): {accuracy_score(y, y_pred_best)}')
Best parameters: {'randomforestclassifier__max_depth': 15, 'randomforestclassifier__min_samples_leaf': 10, 'randomforestclassifier__min_samples_split': 30, 'randomforestclassifier__n_estimators': 100}
Test accuracy (best parameters): 0.7993662864385298
precision_recall_multiclass(rm_classifier_tuned,X,y)
Before:['C' 'CL' 'D'] After: [0 1 2]
Tuned model is less overfitting.
# Function for ploting feature importance scores
def feature_importance_plot(classifier,label):
# get the feature importance scores
feature_importances = classifier.named_steps['randomforestclassifier'].feature_importances_
feature_names = X.columns
importances_series = pd.Series(feature_importances, index=feature_names)
aggregated_importances = importances_series.groupby(lambda x: x.split('_')[2]).sum()
aggregated_importances = pd.DataFrame({'Scores':aggregated_importances})
# Sort the DataFrame by 'Scores' in descending order for better visualization
aggregated_importances_sorted = aggregated_importances.sort_values(by='Scores', ascending=False)
# Plotting
sns.set_style("whitegrid")
plt.figure(figsize=(10, 8)) # Adjust the figure size as needed
sns.barplot(x=aggregated_importances_sorted['Scores'], y=aggregated_importances_sorted.index,edgecolor='black')
plt.title(f'Feature Importance Scores from {label}') # You can customize the title
plt.xlabel('Importance Score') # X-axis label
plt.ylabel('Features') # Y-axis label
plt.show()
return aggregated_importances
feature_importance = feature_importance_plot(rm_classifier_tuned.fit(X,y),'Random Forest')
For high dimensional data, note that this is a projection of data onto 2-D plane.
print(feature_importance.sort_values(by = 'Scores',ascending=False).index.tolist())
['Bilirubin', 'Age', 'N', 'SGOT', 'Copper', 'Hepatomegaly', 'Prothrombin', 'Stage', 'Alk', 'Platelets', 'Cholesterol', 'Tryglicerides', 'Albumin', 'Drug', 'Spiders', 'Edema', 'Sex', 'Ascites']
# Select the most important two features
x1,x2 = 'num__Bilirubin','num__Age'
rm_classifier_tuned.fit(X,y)
# Setting the axis limits
x_min, x_max = X[x1].min() - 1, X[x1].max() + 1
y_min, y_max = X[x2].min() - 1, X[x2].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
np.arange(y_min, y_max, 0.01))
prime_data = pd.DataFrame({x1:xx.ravel(),
x2:yy.ravel()})
# fake data
df_fake = pd.DataFrame()
# two way to fake it
mode_values = X.drop([x1,x2],axis=1).median(axis=0)
fake_data_otherdimension = pd.DataFrame(np.tile(mode_values,(xx.ravel().shape[0],1)),columns = mode_values.index)
#fake_data_otherdimension = pd.DataFrame(np.zeros((xx.ravel().shape[0],X_train_all.shape[1]-2)),columns = mode_values.index)
df_fake = pd.concat([prime_data,fake_data_otherdimension],axis=1)
df_fake = df_fake[X.columns]
# predict
Z = rm_classifier_tuned.predict(df_fake)
Z = Z.reshape(xx.shape)
Z_p = rm_classifier_tuned.predict_proba(df_fake)
Z_p = Z_p[:,1].reshape(xx.shape)
# Access the class labels
class_labels_after = rm_classifier_tuned.classes_
class_labels_before = full_response_preprocessor.classes_
print(f'Before:{class_labels_before},After: {class_labels_after}')
# plot the decision boundary
markers = ('s', 'x', 'o')
colors = ('red', 'blue', 'lightgreen')
cmap = ListedColormap(colors[:len(np.unique(y))])
#plt.contourf(xx, yy, Z_p, alpha=0.4,cmap='Set2')
plt.contourf(xx, yy, Z, alpha=0.5,cmap=cmap)
# Create patches for the legend
red_patch = mpatches.Patch(color='red', label='C')
blue_patch = mpatches.Patch(color='blue', label='CL')
green_patch = mpatches.Patch(color='lightgreen', label='D')
# Add legend for the decision regions
plt.legend(handles=[red_patch, blue_patch, green_patch])
indices = np.random.randint(0,X.shape[0],2000)
plt.scatter(X[x1][indices], X[x2][indices],c=y[indices],edgecolor= 'black',alpha=0.4,marker='.',cmap=cmap)
plt.xlabel('Bilirubin')
plt.ylabel('Ages')
plt.title('Decision Boundary of a Random Forest Classifier')
plt.show()
Before:['C' 'CL' 'D'],After: [0 1 2]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
cum_explained_variance = np.cumsum(pca.explained_variance_ratio_)
print(f'Variance explained:{cum_explained_variance[-1]}')
Variance explained:0.37884640991155716
# Scatter plot of data
target_names = class_labels_before
markers = ('s', 'x', 'o')
colors = ('red', 'blue', 'lightgreen')
for idx, cl in enumerate(np.unique(y)):
plt.scatter(x= X.iloc[y == cl, 0], y= X.iloc[y == cl, 1],
alpha= 0.4, c= colors[idx],
marker= markers[idx], label= target_names[cl], edgecolor= 'black')
plt.xlabel('PC1'); plt.xticks([])
plt.ylabel('PC2'); plt.yticks([])
plt.title('2 components, captures {}% of total variation'.format(cum_explained_variance[1].round(4)*100))
plt.legend(loc="lower right")
plt.show()
/var/folders/55/l79kbh2923l1ckx00bwf_1l00000gn/T/ipykernel_33453/836239285.py:6: UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
plt.scatter(x= X.iloc[y == cl, 0], y= X.iloc[y == cl, 1],
PCA may not be a good option since we two first PCs capture 40% variance and we need 8 PCs to capture 80% variance. But we could use it to visualise the decision boundary.
# Setting the axis limits
x_min, x_max = X_pca[:,0].min() - 1, X_pca[:,0].max() + 1
y_min, y_max = X_pca[:,1].min() - 1, X_pca[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
np.arange(y_min, y_max, 0.01))
df_fake_pca = pd.DataFrame({x1:xx.ravel(),
x2:yy.ravel()})
rm_classifier_pca = RandomForestClassifier(class_weight='balanced')
rm_classifier_pca.fit(X_pca,y)
# predict
Z = rm_classifier_pca.predict(df_fake_pca)
Z = Z.reshape(xx.shape)
Z_p = rm_classifier_pca.predict_proba(df_fake_pca)
Z_p = Z_p[:,1].reshape(xx.shape)
# Access the class labels
class_labels_after = rm_classifier_tuned.classes_
class_labels_before = full_response_preprocessor.classes_
print(f'Before:{class_labels_before},After: {class_labels_after}')
# plot the decision boundary
cmap = ListedColormap(colors[:len(np.unique(y))])
plt.contourf(xx, yy, Z, alpha=0.5,cmap=cmap)
indices = np.random.randint(0,X.shape[0],2000)# choose part of the data to plot to avoid overplotting
plt.scatter(X_pca[indices,0], X_pca[indices,1],c=y[indices],edgecolor='black',alpha=0.2,marker='.',cmap=cmap)
plt.xlabel('Bilirubin')
plt.ylabel('Ages')
plt.title('Decision Boundary in first two PCs')
plt.show()
/Users/del/miniconda3/envs/data_w/lib/python3.9/site-packages/sklearn/base.py:458: UserWarning: X has feature names, but RandomForestClassifier was fitted without feature names warnings.warn( /Users/del/miniconda3/envs/data_w/lib/python3.9/site-packages/sklearn/base.py:458: UserWarning: X has feature names, but RandomForestClassifier was fitted without feature names warnings.warn(
Before:['C' 'CL' 'D'],After: [0 1 2]
# initialse
sgd_classifier = SGDClassifier(max_iter=1000, tol=1e-3, random_state=79,class_weight='balanced',loss='log_loss')
# prediction with cross-validation using SMOTE (Change here for ADASYN or use no resampling techniques)
imba_sgd = make_pipeline(SMOTE(random_state=5059), sgd_classifier)
y_pred_proba = cross_val_predict(imba_sgd, X, y, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
log loss: 0.7893865593482716 Validation accuracy: 0.6799746514575412 Null Accuracy: C 0.628644 D 0.336502 CL 0.034854 Name: Status, dtype: float64
confusion_matrix_plot(confusion_matrix(y,y_pred),'SGD')
normalised_confusion_matrix_plot(confusion_matrix(y,y_pred),'SGD')
First of all, the data The control parameter c should be large since the boudary is not very wide.
# Creating the SVM model
svc_classifier = SVC(kernel='poly', C=100,random_state=79,probability=True) # You can choose other kernels like 'rbf'
imba_svc = make_pipeline(SMOTE(random_state=5059), svc_classifier)
y_pred_prob = cross_val_predict(imba_svc,X, y,cv=5,method='predict_proba')
y_pred = cross_val_predict(svc_classifier,X, y,cv=5)
print(f'log loss: {log_loss(y, y_pred_prob)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
log loss: 0.8145346883060302 Validation accuracy: 0.7552598225602027
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.81 0.86 0.83 4960
1 0.12 0.14 0.13 275
2 0.72 0.62 0.67 2655
accuracy 0.76 7890
macro avg 0.55 0.54 0.54 7890
weighted avg 0.75 0.76 0.75 7890
confusion_matrix_plot(confusion_matrix(y,y_pred),'SVM')
normalised_confusion_matrix_plot(confusion_matrix(y,y_pred),'SVM')
# Define the model
model = Sequential([
Dense(32, activation='relu', input_shape=[X.shape[1]]),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
callbacks = EarlyStopping(monitor='val_loss', patience=5)
model.summary()
Model: "sequential_18"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_72 (Dense) (None, 32) 704
dense_73 (Dense) (None, 64) 2112
dense_74 (Dense) (None, 32) 2080
dense_75 (Dense) (None, 3) 99
=================================================================
Total params: 4995 (19.51 KB)
Trainable params: 4995 (19.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
# create the validation set
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=79,stratify = y)
X_train.shape
(6312, 21)
np.unique(y_train)
array([0, 1, 2])
# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks = [callbacks])
Epoch 1/100 198/198 [==============================] - 3s 5ms/step - loss: 0.6029 - accuracy: 0.7687 - val_loss: 0.5226 - val_accuracy: 0.8004 Epoch 2/100 198/198 [==============================] - 1s 4ms/step - loss: 0.5150 - accuracy: 0.8029 - val_loss: 0.5106 - val_accuracy: 0.7991 Epoch 3/100 198/198 [==============================] - 1s 5ms/step - loss: 0.4972 - accuracy: 0.8032 - val_loss: 0.5039 - val_accuracy: 0.8016 Epoch 4/100 198/198 [==============================] - 1s 5ms/step - loss: 0.4898 - accuracy: 0.8094 - val_loss: 0.4969 - val_accuracy: 0.8048 Epoch 5/100 198/198 [==============================] - 1s 5ms/step - loss: 0.4775 - accuracy: 0.8094 - val_loss: 0.4936 - val_accuracy: 0.8035 Epoch 6/100 198/198 [==============================] - 1s 4ms/step - loss: 0.4712 - accuracy: 0.8143 - val_loss: 0.4907 - val_accuracy: 0.8035 Epoch 7/100 198/198 [==============================] - 1s 4ms/step - loss: 0.4629 - accuracy: 0.8181 - val_loss: 0.4954 - val_accuracy: 0.8061 Epoch 8/100 198/198 [==============================] - 1s 3ms/step - loss: 0.4554 - accuracy: 0.8243 - val_loss: 0.4970 - val_accuracy: 0.8042 Epoch 9/100 198/198 [==============================] - 1s 3ms/step - loss: 0.4511 - accuracy: 0.8246 - val_loss: 0.4944 - val_accuracy: 0.8099 Epoch 10/100 198/198 [==============================] - 1s 5ms/step - loss: 0.4445 - accuracy: 0.8281 - val_loss: 0.4928 - val_accuracy: 0.8112 Epoch 11/100 198/198 [==============================] - 1s 3ms/step - loss: 0.4374 - accuracy: 0.8292 - val_loss: 0.4929 - val_accuracy: 0.8105
# get confusion matrix for validation set
y_pred = model.predict(X_valid)
y_pred = np.argmax(y_pred, axis=1)
50/50 [==============================] - 1s 9ms/step
print(classification_report(y_valid, y_pred))
precision recall f1-score support
0 0.84 0.90 0.87 992
1 1.00 0.02 0.04 55
2 0.75 0.73 0.74 531
accuracy 0.81 1578
macro avg 0.86 0.55 0.55 1578
weighted avg 0.82 0.81 0.80 1578
conf_matrix = confusion_matrix(y_valid,y_pred)
confusion_matrix_plot(conf_matrix,'Neural Network')
normalised_confusion_matrix_plot(conf_matrix,'Neural Network')
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0.35, 0.85)
(0.35, 0.85)
X, y, _ = preprocess_data(OD_train)
# create the validation set
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=79,stratify = y)
def model_built():
model = Sequential([
Input(shape = (X.shape[1],)),
Dense(32, activation='relu'),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']) # metrics CategoricalCrossentropy
# Create a pipeline that first applies SMOTE and then feeds the data into the neural network
return model
earlystop = EarlyStopping(monitor='val_loss', patience=5)
checkpoint_cb = ModelCheckpoint("my_keras_model.keras",save_best_only=True)
# Train the model
history = model_built().fit(X, y,validation_split=0.2,epochs=50,callbacks=[earlystop,checkpoint_cb])
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1)
Epoch 1/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 4s 15ms/step - accuracy: 0.6615 - loss: 0.7593 - val_accuracy: 0.7997 - val_loss: 0.5207 Epoch 2/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8022 - loss: 0.5191 - val_accuracy: 0.8042 - val_loss: 0.5046 Epoch 3/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8084 - loss: 0.4974 - val_accuracy: 0.8086 - val_loss: 0.5019 Epoch 4/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8030 - loss: 0.4949 - val_accuracy: 0.8067 - val_loss: 0.4959 Epoch 5/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8190 - loss: 0.4706 - val_accuracy: 0.8086 - val_loss: 0.4932 Epoch 6/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 3s 13ms/step - accuracy: 0.8217 - loss: 0.4614 - val_accuracy: 0.8093 - val_loss: 0.4927 Epoch 7/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8214 - loss: 0.4706 - val_accuracy: 0.8074 - val_loss: 0.4946 Epoch 8/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8249 - loss: 0.4594 - val_accuracy: 0.8093 - val_loss: 0.4927 Epoch 9/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8329 - loss: 0.4469 - val_accuracy: 0.8067 - val_loss: 0.4951 Epoch 10/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8262 - loss: 0.4417 - val_accuracy: 0.8054 - val_loss: 0.4975 Epoch 11/50 198/198 ━━━━━━━━━━━━━━━━━━━━ 2s 13ms/step - accuracy: 0.8295 - loss: 0.4516 - val_accuracy: 0.8086 - val_loss: 0.4998
(0.0, 1.0)
# Wrap the model with KerasClassifier for compatibility with scikit-learn
neural_network = KerasClassifier(model=model_built, epochs=50, verbose=0,callbacks=[earlystop,checkpoint_cb])
imb_neural_network = IMBPipeline([
('smote', SMOTE(random_state=79)),
('nn', neural_network)
])
# Wrap the model with KerasClassifier for compatibility with scikit-learn
neural_network = KerasClassifier(model=model_built, epochs=100, verbose=0,callbacks=[earlystop,checkpoint_cb])
imb_neural_network = IMBPipeline([
('smote', SMOTE(random_state=79)),
('nn', neural_network)
])
neural_network
KerasClassifier( model=<function model_built at 0x2aa046f70> build_fn=None warm_start=False random_state=None optimizer=rmsprop loss=None metrics=None batch_size=None validation_batch_size=None verbose=0 callbacks=[<keras.src.callbacks.early_stopping.EarlyStopping object at 0x2e5b38850>, <keras.src.callbacks.model_checkpoint.ModelCheckpoint object at 0x2a8a69310>] validation_split=0.0 shuffle=True run_eagerly=False epochs=100 class_weight=None )In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KerasClassifier( model=<function model_built at 0x2aa046f70> build_fn=None warm_start=False random_state=None optimizer=rmsprop loss=None metrics=None batch_size=None validation_batch_size=None verbose=0 callbacks=[<keras.src.callbacks.early_stopping.EarlyStopping object at 0x2e5b38850>, <keras.src.callbacks.model_checkpoint.ModelCheckpoint object at 0x2a8a69310>] validation_split=0.0 shuffle=True run_eagerly=False epochs=100 class_weight=None )
# VoteClassifier
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=79,stratify = y)
voting_classifier = VotingClassifier(estimators=[('rf', rm_classifier_tuned),
('svc',imba_svc),('sgd',imba_sgd)],
voting='hard',random_state=79)
y_pred = cross_val_predict(voting_classifier,X,y,cv=5,random_state=79)
accuracy_score(y,y_pred)
0.7816223067173638
for ii in (rm_classifier_tuned,imba_sgd,imba_svc):
y_pred1 = cross_val_predict(ii,X,y,cv=5)
print(accuracy_score(y,y_pred1))
0.7993662864385298 0.6920152091254753 0.7368821292775666
y_pred = voting_classifier.predict(X_valid)
accuracy_score(y_valid,y_pred)
0.7801013941698353
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
y_pred = cross_val_predict(rm_classifier_tuned, X, y, cv=5)
y_pred_proba = cross_val_predict(rm_classifier_tuned, X, y, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
log loss: 0.4761842933463752 Validation accuracy: 0.8171102661596958 Null Accuracy: C 0.628644 D 0.336502 CL 0.034854 Name: Status, dtype: float64
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.83 0.92 0.87 4960
1 0.00 0.00 0.00 275
2 0.78 0.71 0.74 2655
accuracy 0.82 7890
macro avg 0.54 0.54 0.54 7890
weighted avg 0.79 0.82 0.80 7890
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Tuned Random Forest: Original Data')
normalised_confusion_matrix_plot(conf_matrix,'Tuned Random Forest: Original Data')
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
imba_pipeline = make_pipeline(SMOTE(random_state=5059), rm_classifier_tuned)
y_pred = cross_val_predict(imba_pipeline, X, y, cv=5)
y_pred_proba = cross_val_predict(imba_pipeline, X, y, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
log loss: 0.5558507908030657 Validation accuracy: 0.7978453738910013 Null Accuracy: C 0.628644 D 0.336502 CL 0.034854 Name: Status, dtype: float64
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.87 0.84 0.86 4960
1 0.25 0.37 0.30 275
2 0.75 0.76 0.75 2655
accuracy 0.80 7890
macro avg 0.62 0.66 0.64 7890
weighted avg 0.81 0.80 0.80 7890
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Tuned Random Forest: SMOTE')
normalised_confusion_matrix_plot(conf_matrix,'Tuned Random Forest: SMOTE')
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
imba_pipeline = make_pipeline(ADASYN(random_state=5059), rm_classifier_tuned)
y_pred = cross_val_predict(imba_pipeline, X, y, cv=5)
y_pred_proba = cross_val_predict(imba_pipeline, X, y, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
log loss: 0.5732224010201783 Validation accuracy: 0.7902408111533586 Null Accuracy: C 0.628644 D 0.336502 CL 0.034854 Name: Status, dtype: float64
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.88 0.82 0.85 4960
1 0.24 0.35 0.29 275
2 0.72 0.78 0.75 2655
accuracy 0.79 7890
macro avg 0.61 0.65 0.63 7890
weighted avg 0.80 0.79 0.80 7890
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Tuned Random Forest: ADAYSN')
normalised_confusion_matrix_plot(conf_matrix,'Tuned Random Forest: ADASYN')
smote = SMOTE(random_state=5059)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
model = Sequential([
Dense(32, activation='relu', input_shape=[X_train_smote.shape[1]]),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
callbacks = EarlyStopping(monitor='val_loss', patience=10)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/threadpoolctl.py:1019: RuntimeWarning: libc not found. The ctypes module in Python 3.9 is maybe too old for this OS. warnings.warn(
history = model.fit(X_train_smote, y_train_smote, epochs=100, validation_data=(X_valid, y_valid), callbacks = [callbacks])
Epoch 1/100 372/372 [==============================] - 3s 5ms/step - loss: 0.7602 - accuracy: 0.6704 - val_loss: 0.6659 - val_accuracy: 0.7288 Epoch 2/100 372/372 [==============================] - 2s 4ms/step - loss: 0.6260 - accuracy: 0.7469 - val_loss: 0.6691 - val_accuracy: 0.7053 Epoch 3/100 372/372 [==============================] - 1s 3ms/step - loss: 0.5420 - accuracy: 0.7869 - val_loss: 0.6110 - val_accuracy: 0.7440 Epoch 4/100 372/372 [==============================] - 1s 3ms/step - loss: 0.4788 - accuracy: 0.8165 - val_loss: 0.6418 - val_accuracy: 0.7256 Epoch 5/100 372/372 [==============================] - 2s 4ms/step - loss: 0.4335 - accuracy: 0.8325 - val_loss: 0.6419 - val_accuracy: 0.7408 Epoch 6/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3946 - accuracy: 0.8469 - val_loss: 0.6968 - val_accuracy: 0.7446 Epoch 7/100 372/372 [==============================] - 1s 4ms/step - loss: 0.3693 - accuracy: 0.8575 - val_loss: 0.6322 - val_accuracy: 0.7719 Epoch 8/100 372/372 [==============================] - 1s 4ms/step - loss: 0.3462 - accuracy: 0.8658 - val_loss: 0.6733 - val_accuracy: 0.7598 Epoch 9/100 372/372 [==============================] - 1s 4ms/step - loss: 0.3295 - accuracy: 0.8707 - val_loss: 0.6739 - val_accuracy: 0.7674 Epoch 10/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3128 - accuracy: 0.8785 - val_loss: 0.7005 - val_accuracy: 0.7579 Epoch 11/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3017 - accuracy: 0.8831 - val_loss: 0.6955 - val_accuracy: 0.7814 Epoch 12/100 372/372 [==============================] - 2s 6ms/step - loss: 0.2895 - accuracy: 0.8860 - val_loss: 0.7176 - val_accuracy: 0.7643 Epoch 13/100 372/372 [==============================] - 2s 4ms/step - loss: 0.2788 - accuracy: 0.8903 - val_loss: 0.7599 - val_accuracy: 0.7687
# get confusion matrix for validation set
y_pred = model.predict(X_valid)
y_pred = np.argmax(y_pred, axis=1)
50/50 [==============================] - 0s 2ms/step
print(classification_report(y_valid, y_pred))
precision recall f1-score support
0 0.88 0.79 0.83 992
1 0.17 0.27 0.21 55
2 0.70 0.78 0.74 531
accuracy 0.77 1578
macro avg 0.58 0.61 0.59 1578
weighted avg 0.79 0.77 0.78 1578
conf_matrix = confusion_matrix(y_valid,y_pred)
confusion_matrix_plot(conf_matrix,'Neural Network: SMOTE')
normalised_confusion_matrix_plot(conf_matrix,'Neural Network: SMOTE')
train_copy = OD_train.copy()
train_copy = df_strings_to_numbers(train_copy)
train_copy['Status'] = train_copy['Status'].replace({0:'D', 1: 'CL', 2:'C'})
train_copy['Status'] = train_copy['Status'].replace({'C': 0, 'CL':1, 'D': 2})
y_OS = train_copy['Status']
X_OS = train_copy.drop('Status', axis=1)
X_OS = X_OS.drop('id', axis=1)
X_train, X_val, y_train, y_val = train_test_split(X_OS,y_OS,test_size=0.2,random_state=79, stratify=y_OS)
smote = SMOTE(random_state=5059)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(y_OS.value_counts(normalize=True))
concat_train = pd.concat([X_train_smote,y_train_smote],axis=1)
concat_val = pd.concat([X_val,y_val],axis=1)
X_tmp_train, y_tmp_train = processing_step_2(concat_train)
X_tmp_val, y_tmp_val = processing_step_2(concat_val)
X_tmp_train['Stage'] = X_tmp_train['Stage'].round(0)
X_tmp_val['Stage'] = X_tmp_val['Stage'].round(0)
X_ready_train = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp_train), columns=full_input_preprocessor.get_feature_names_out())
X_ready_val = pd.DataFrame(full_input_preprocessor.transform(X_tmp_val), columns=full_input_preprocessor.get_feature_names_out())
y_ready_train = full_response_preprocessor.fit_transform(y_tmp_train)
y_ready_val = full_response_preprocessor.transform(y_tmp_val)
0 0.628083 2 0.337128 1 0.034788 Name: Status, dtype: float64
print(X_ready_train.shape)
model = Sequential([
Dense(32, activation='relu', input_shape=[X_ready_train.shape[1]]),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
callbacks = EarlyStopping(monitor='val_loss', patience=10)
(11897, 21)
history = model.fit(X_ready_train, y_ready_train, epochs=100, validation_data=(X_ready_val, y_ready_val), callbacks = [callbacks])
Epoch 1/100 372/372 [==============================] - 3s 4ms/step - loss: 0.7410 - accuracy: 0.6831 - val_loss: 0.6883 - val_accuracy: 0.7064 Epoch 2/100 372/372 [==============================] - 1s 3ms/step - loss: 0.6100 - accuracy: 0.7564 - val_loss: 0.6335 - val_accuracy: 0.7368 Epoch 3/100 372/372 [==============================] - 2s 4ms/step - loss: 0.5578 - accuracy: 0.7804 - val_loss: 0.6598 - val_accuracy: 0.7368 Epoch 4/100 372/372 [==============================] - 2s 5ms/step - loss: 0.5240 - accuracy: 0.7952 - val_loss: 0.6628 - val_accuracy: 0.7349 Epoch 5/100 372/372 [==============================] - 1s 3ms/step - loss: 0.4957 - accuracy: 0.8078 - val_loss: 0.5858 - val_accuracy: 0.7793 Epoch 6/100 372/372 [==============================] - 1s 2ms/step - loss: 0.4697 - accuracy: 0.8207 - val_loss: 0.6834 - val_accuracy: 0.7311 Epoch 7/100 372/372 [==============================] - 2s 4ms/step - loss: 0.4528 - accuracy: 0.8280 - val_loss: 0.6581 - val_accuracy: 0.7464 Epoch 8/100 372/372 [==============================] - 2s 5ms/step - loss: 0.4337 - accuracy: 0.8339 - val_loss: 0.6864 - val_accuracy: 0.7425 Epoch 9/100 372/372 [==============================] - 1s 3ms/step - loss: 0.4181 - accuracy: 0.8396 - val_loss: 0.6649 - val_accuracy: 0.7546 Epoch 10/100 372/372 [==============================] - 2s 7ms/step - loss: 0.3995 - accuracy: 0.8490 - val_loss: 0.6681 - val_accuracy: 0.7666 Epoch 11/100 372/372 [==============================] - 2s 4ms/step - loss: 0.3883 - accuracy: 0.8519 - val_loss: 0.7244 - val_accuracy: 0.7438 Epoch 12/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3764 - accuracy: 0.8562 - val_loss: 0.7109 - val_accuracy: 0.7584 Epoch 13/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3650 - accuracy: 0.8603 - val_loss: 0.7245 - val_accuracy: 0.7438 Epoch 14/100 372/372 [==============================] - 2s 5ms/step - loss: 0.3533 - accuracy: 0.8651 - val_loss: 0.6926 - val_accuracy: 0.7749 Epoch 15/100 372/372 [==============================] - 1s 3ms/step - loss: 0.3442 - accuracy: 0.8674 - val_loss: 0.7517 - val_accuracy: 0.7483
# get confusion matrix for validation set
y_pred = model.predict(X_ready_val)
y_pred = np.argmax(y_pred, axis=1)
50/50 [==============================] - 0s 2ms/step
print(classification_report(y_ready_val, y_pred))
precision recall f1-score support
0 0.88 0.78 0.82 992
1 0.12 0.29 0.17 55
2 0.69 0.75 0.72 530
accuracy 0.75 1577
macro avg 0.56 0.60 0.57 1577
weighted avg 0.79 0.75 0.77 1577
conf_matrix = confusion_matrix(y_ready_val,y_pred)
confusion_matrix_plot(conf_matrix,'Neural Network: SMOTE')
normalised_confusion_matrix_plot(conf_matrix,'Neural Network: SMOTE')
adaysn = ADASYN(random_state=5059)
X_train_adasyn, y_train_adasyn = adaysn.fit_resample(X_train, y_train)
model = Sequential([
Dense(32, activation='relu', input_shape=[X_train_adasyn.shape[1]]),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
callbacks = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(X_train_adasyn, y_train_adasyn, epochs=100, validation_data=(X_valid, y_valid), callbacks = [callbacks])
Epoch 1/100 361/361 [==============================] - 2s 4ms/step - loss: 0.8263 - accuracy: 0.6283 - val_loss: 0.7421 - val_accuracy: 0.6857 Epoch 2/100 361/361 [==============================] - 1s 3ms/step - loss: 0.6999 - accuracy: 0.7029 - val_loss: 0.6732 - val_accuracy: 0.7224 Epoch 3/100 361/361 [==============================] - 1s 2ms/step - loss: 0.6139 - accuracy: 0.7476 - val_loss: 0.6867 - val_accuracy: 0.7212 Epoch 4/100 361/361 [==============================] - 1s 4ms/step - loss: 0.5428 - accuracy: 0.7773 - val_loss: 0.6563 - val_accuracy: 0.7446 Epoch 5/100 361/361 [==============================] - 1s 4ms/step - loss: 0.4964 - accuracy: 0.8005 - val_loss: 0.7301 - val_accuracy: 0.7294 Epoch 6/100 361/361 [==============================] - 2s 4ms/step - loss: 0.4532 - accuracy: 0.8196 - val_loss: 0.7275 - val_accuracy: 0.7389 Epoch 7/100 361/361 [==============================] - 1s 4ms/step - loss: 0.4240 - accuracy: 0.8289 - val_loss: 0.7558 - val_accuracy: 0.7421 Epoch 8/100 361/361 [==============================] - 1s 3ms/step - loss: 0.4055 - accuracy: 0.8371 - val_loss: 0.7434 - val_accuracy: 0.7510 Epoch 9/100 361/361 [==============================] - 1s 2ms/step - loss: 0.3866 - accuracy: 0.8420 - val_loss: 0.7436 - val_accuracy: 0.7605 Epoch 10/100 361/361 [==============================] - 1s 3ms/step - loss: 0.3665 - accuracy: 0.8505 - val_loss: 0.8077 - val_accuracy: 0.7351 Epoch 11/100 361/361 [==============================] - 1s 3ms/step - loss: 0.3546 - accuracy: 0.8523 - val_loss: 0.7901 - val_accuracy: 0.7586 Epoch 12/100 361/361 [==============================] - 2s 5ms/step - loss: 0.3447 - accuracy: 0.8571 - val_loss: 0.8049 - val_accuracy: 0.7630 Epoch 13/100 361/361 [==============================] - 1s 3ms/step - loss: 0.3325 - accuracy: 0.8627 - val_loss: 0.8478 - val_accuracy: 0.7402 Epoch 14/100 361/361 [==============================] - 1s 3ms/step - loss: 0.3228 - accuracy: 0.8669 - val_loss: 0.8341 - val_accuracy: 0.7662
# get confusion matrix for validation set
y_pred = model.predict(X_valid)
y_pred = np.argmax(y_pred, axis=1)
conf_matrix = confusion_matrix(y_valid,y_pred)
confusion_matrix_plot(conf_matrix,'Neural Network: ADASYN')
50/50 [==============================] - 0s 3ms/step
normalised_confusion_matrix_plot(conf_matrix,'Neural Network: ADASYN')
train_copy = OD_train.copy()
train_copy = df_strings_to_numbers(train_copy)
train_copy['Status'] = train_copy['Status'].replace({0:'D', 1: 'CL', 2:'C'})
train_copy['Status'] = train_copy['Status'].replace({'C': 0, 'CL':1, 'D': 2})
y_OS = train_copy['Status']
X_OS = train_copy.drop('Status', axis=1)
X_OS = X_OS.drop('id', axis=1)
X_train, X_val, y_train, y_val = train_test_split(X_OS,y_OS,test_size=0.2,random_state=79, stratify=y_OS)
adasyn = ADASYN(random_state=5059)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print(y_OS.value_counts(normalize=True))
concat_train = pd.concat([X_train_adasyn,y_train_adasyn],axis=1)
concat_val = pd.concat([X_val,y_val],axis=1)
X_tmp_train, y_tmp_train = processing_step_2(concat_train)
X_tmp_val, y_tmp_val = processing_step_2(concat_val)
X_tmp_train['Stage'] = X_tmp_train['Stage'].round(0)
X_tmp_val['Stage'] = X_tmp_val['Stage'].round(0)
X_ready_train = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp_train), columns=full_input_preprocessor.get_feature_names_out())
X_ready_val = pd.DataFrame(full_input_preprocessor.transform(X_tmp_val), columns=full_input_preprocessor.get_feature_names_out())
y_ready_train = full_response_preprocessor.fit_transform(y_tmp_train)
y_ready_val = full_response_preprocessor.transform(y_tmp_val)
0 0.628083 2 0.337128 1 0.034788 Name: Status, dtype: float64
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/threadpoolctl.py:1019: RuntimeWarning: libc not found. The ctypes module in Python 3.9 is maybe too old for this OS. warnings.warn(
print(X_ready_train.shape)
model = Sequential([
Dense(32, activation='relu', input_shape=[X_ready_train.shape[1]]),
Dense(64, activation='relu'),
Dense(32, activation='relu'),
Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
callbacks = EarlyStopping(monitor='val_loss', patience=10)
(12037, 21)
history = model.fit(X_ready_train, y_ready_train, epochs=100, validation_data=(X_ready_val, y_ready_val), callbacks = [callbacks])
Epoch 1/100 377/377 [==============================] - 2s 3ms/step - loss: 0.7843 - accuracy: 0.6610 - val_loss: 0.7048 - val_accuracy: 0.6937 Epoch 2/100 377/377 [==============================] - 1s 2ms/step - loss: 0.6653 - accuracy: 0.7287 - val_loss: 0.6551 - val_accuracy: 0.7178 Epoch 3/100 377/377 [==============================] - 1s 2ms/step - loss: 0.6101 - accuracy: 0.7537 - val_loss: 0.6107 - val_accuracy: 0.7445 Epoch 4/100 377/377 [==============================] - 1s 2ms/step - loss: 0.5672 - accuracy: 0.7730 - val_loss: 0.6869 - val_accuracy: 0.7013 Epoch 5/100 377/377 [==============================] - 1s 2ms/step - loss: 0.5329 - accuracy: 0.7889 - val_loss: 0.6295 - val_accuracy: 0.7508 Epoch 6/100 377/377 [==============================] - 1s 2ms/step - loss: 0.5013 - accuracy: 0.8018 - val_loss: 0.6178 - val_accuracy: 0.7527 Epoch 7/100 377/377 [==============================] - 1s 2ms/step - loss: 0.4782 - accuracy: 0.8126 - val_loss: 0.6338 - val_accuracy: 0.7584 Epoch 8/100 377/377 [==============================] - 1s 2ms/step - loss: 0.4568 - accuracy: 0.8215 - val_loss: 0.6509 - val_accuracy: 0.7508 Epoch 9/100 377/377 [==============================] - 1s 2ms/step - loss: 0.4401 - accuracy: 0.8258 - val_loss: 0.6663 - val_accuracy: 0.7571 Epoch 10/100 377/377 [==============================] - 1s 2ms/step - loss: 0.4249 - accuracy: 0.8318 - val_loss: 0.6984 - val_accuracy: 0.7457 Epoch 11/100 377/377 [==============================] - 1s 3ms/step - loss: 0.4100 - accuracy: 0.8382 - val_loss: 0.7131 - val_accuracy: 0.7375 Epoch 12/100 377/377 [==============================] - 1s 2ms/step - loss: 0.3991 - accuracy: 0.8389 - val_loss: 0.7063 - val_accuracy: 0.7521 Epoch 13/100 377/377 [==============================] - 1s 3ms/step - loss: 0.3866 - accuracy: 0.8476 - val_loss: 0.7442 - val_accuracy: 0.7394
# get confusion matrix for validation set
y_pred = model.predict(X_ready_val)
y_pred = np.argmax(y_pred, axis=1)
50/50 [==============================] - 0s 1ms/step
print(classification_report(y_ready_val, y_pred))
precision recall f1-score support
0 0.88 0.75 0.81 992
1 0.08 0.16 0.11 55
2 0.67 0.78 0.72 530
accuracy 0.74 1577
macro avg 0.54 0.56 0.55 1577
weighted avg 0.78 0.74 0.75 1577
conf_matrix = confusion_matrix(y_ready_val,y_pred)
confusion_matrix_plot(conf_matrix,'Neural Network: ADASYN')
normalised_confusion_matrix_plot(conf_matrix,'Neural Network: ADASYN')
# prepare test data
X_test,y_test, ID = preprocess_data(OD_test,type = 'test')
y_test_predproba = rm_classifier_tuned.named_steps['randomforestclassifier'].predict_proba(X_test)
kaggle_set_rm = pd.DataFrame(y_test_predproba,index = ID, columns = ['Status_C','Status_CL','Status_D'])
# Submission
kaggle_set_rm.to_csv('data/kaggle_set.csv')
train_file = './data/train.csv'
train = pd.read_csv(train_file)
nan_train = train.copy(deep=True)
nan_train.loc[nan_train.sample(frac=0.2, random_state=1).index, 'N_Days'] = np.nan
nan_train.loc[nan_train.sample(frac=0.2, random_state=2).index, 'Bilirubin'] = np.nan
nan_train.loc[nan_train.sample(frac=0.2, random_state=3).index, 'Stage'] = np.nan
nan_train.loc[nan_train.sample(frac=0.2, random_state=4).index, 'Edema'] = np.nan
nan_train[nan_train['N_Days'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 7 | NaN | Placebo | 21281 | F | N | Y | N | N | 0.6 | 273.0 | 3.94 | 36.0 | 598.0 | 52.70 | 214.0 | 227.0 | 9.9 | 3.0 | C |
| 25 | 25 | NaN | Placebo | 27220 | F | N | Y | N | N | 1.3 | 325.0 | 3.60 | 81.0 | 2065.0 | 232.50 | 100.0 | 277.0 | 11.1 | NaN | C |
| 28 | 28 | NaN | D-penicillamine | 15105 | F | N | Y | Y | N | NaN | 528.0 | 3.34 | 173.0 | 1282.0 | 120.90 | 55.0 | 123.0 | 10.7 | 4.0 | D |
| 33 | 33 | NaN | Placebo | 20254 | F | N | N | N | NaN | 3.9 | 373.0 | 3.67 | 281.0 | 754.0 | 130.20 | 89.0 | 277.0 | 11.6 | NaN | D |
| 34 | 34 | NaN | Placebo | 23235 | F | Y | Y | Y | Y | 1.3 | 178.0 | 2.93 | 188.0 | 1020.0 | 119.35 | 178.0 | 136.0 | 11.5 | NaN | D |
nan_train[nan_train['Edema'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 2574.0 | Placebo | 19237 | F | N | N | N | NaN | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C |
| 3 | 3 | 2576.0 | Placebo | 18460 | F | N | N | N | NaN | NaN | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 11 | 11 | 1614.0 | Placebo | 14106 | F | N | N | N | NaN | 0.9 | 328.0 | 3.61 | 62.0 | 1105.0 | 137.95 | 95.0 | 145.0 | 9.5 | 3.0 | C |
| 13 | 13 | 1153.0 | D-penicillamine | 22347 | F | N | Y | N | NaN | 0.6 | 232.0 | 3.83 | 24.0 | 678.0 | 65.10 | 99.0 | 248.0 | 10.4 | 3.0 | C |
| 19 | 19 | 3358.0 | Placebo | 17031 | F | N | N | N | NaN | 0.6 | 322.0 | 3.77 | 52.0 | 834.0 | 60.45 | 214.0 | 153.0 | 11.0 | 3.0 | C |
nan_train[nan_train['Bilirubin'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 3 | 2576.0 | Placebo | 18460 | F | N | N | N | NaN | NaN | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 4 | 4 | 788.0 | Placebo | 16658 | F | N | Y | N | N | NaN | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C |
| 15 | 15 | 1212.0 | Placebo | 15112 | F | N | N | N | N | NaN | 335.0 | 3.54 | 44.0 | 1345.0 | 137.95 | 145.0 | 244.0 | 10.6 | 3.0 | C |
| 20 | 20 | 3092.0 | Placebo | 15612 | F | N | Y | N | NaN | NaN | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | NaN | C |
| 27 | 27 | 4467.0 | D-penicillamine | 12398 | F | N | Y | N | N | NaN | 414.0 | 3.43 | 41.0 | 876.0 | 84.00 | 110.0 | 385.0 | 11.0 | 3.0 | C |
nan_train[nan_train['Stage'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999.0 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | NaN | D |
| 10 | 10 | 3581.0 | Placebo | 25772 | F | N | N | N | N | 0.5 | 252.0 | 3.60 | 26.0 | 377.0 | 56.76 | 185.0 | 336.0 | 10.0 | NaN | C |
| 17 | 17 | 1592.0 | D-penicillamine | 14872 | F | N | Y | N | N | 1.1 | 392.0 | 3.43 | 39.0 | 1395.0 | 184.45 | 133.0 | 328.0 | 11.2 | NaN | C |
| 20 | 20 | 3092.0 | Placebo | 15612 | F | N | Y | N | NaN | NaN | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | NaN | C |
| 23 | 23 | 1152.0 | D-penicillamine | 16736 | F | N | Y | N | N | 1.1 | 373.0 | 3.90 | 69.0 | 1353.0 | 116.25 | 139.0 | 268.0 | 10.0 | NaN | C |
nan_train.to_csv('train_with_missing_vals.csv', index=False)
#read in file from column Dropper
nan_train_file = './train_with_missing_vals.csv'
nan_train = pd.read_csv(nan_train_file)
train_simputer = train.copy(deep=True)
View NaN values
train_simputer.isna().sum()
id 0 N_Days 0 Drug 0 Age 0 Sex 0 Ascites 0 Hepatomegaly 0 Spiders 0 Edema 0 Bilirubin 0 Cholesterol 0 Albumin 0 Copper 0 Alk_Phos 0 SGOT 0 Tryglicerides 0 Platelets 0 Prothrombin 0 Stage 0 Status 0 dtype: int64
View first five NaN for N_Days:
nan_train[nan_train['N_Days'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 7 | NaN | Placebo | 21281 | F | N | Y | N | N | 0.6 | 273.0 | 3.94 | 36.0 | 598.0 | 52.70 | 214.0 | 227.0 | 9.9 | 3.0 | C |
| 25 | 25 | NaN | Placebo | 27220 | F | N | Y | N | N | 1.3 | 325.0 | 3.60 | 81.0 | 2065.0 | 232.50 | 100.0 | 277.0 | 11.1 | NaN | C |
| 28 | 28 | NaN | D-penicillamine | 15105 | F | N | Y | Y | N | NaN | 528.0 | 3.34 | 173.0 | 1282.0 | 120.90 | 55.0 | 123.0 | 10.7 | 4.0 | D |
| 33 | 33 | NaN | Placebo | 20254 | F | N | N | N | NaN | 3.9 | 373.0 | 3.67 | 281.0 | 754.0 | 130.20 | 89.0 | 277.0 | 11.6 | NaN | D |
| 34 | 34 | NaN | Placebo | 23235 | F | Y | Y | Y | Y | 1.3 | 178.0 | 2.93 | 188.0 | 1020.0 | 119.35 | 178.0 | 136.0 | 11.5 | NaN | D |
Edema
nan_train[nan_train['Edema'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 2574.0 | Placebo | 19237 | F | N | N | N | NaN | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | C |
| 3 | 3 | 2576.0 | Placebo | 18460 | F | N | N | N | NaN | NaN | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 11 | 11 | 1614.0 | Placebo | 14106 | F | N | N | N | NaN | 0.9 | 328.0 | 3.61 | 62.0 | 1105.0 | 137.95 | 95.0 | 145.0 | 9.5 | 3.0 | C |
| 13 | 13 | 1153.0 | D-penicillamine | 22347 | F | N | Y | N | NaN | 0.6 | 232.0 | 3.83 | 24.0 | 678.0 | 65.10 | 99.0 | 248.0 | 10.4 | 3.0 | C |
| 19 | 19 | 3358.0 | Placebo | 17031 | F | N | N | N | NaN | 0.6 | 322.0 | 3.77 | 52.0 | 834.0 | 60.45 | 214.0 | 153.0 | 11.0 | 3.0 | C |
Bilirubin
nan_train[nan_train['Bilirubin'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 3 | 2576.0 | Placebo | 18460 | F | N | N | N | NaN | NaN | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | C |
| 4 | 4 | 788.0 | Placebo | 16658 | F | N | Y | N | N | NaN | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | C |
| 15 | 15 | 1212.0 | Placebo | 15112 | F | N | N | N | N | NaN | 335.0 | 3.54 | 44.0 | 1345.0 | 137.95 | 145.0 | 244.0 | 10.6 | 3.0 | C |
| 20 | 20 | 3092.0 | Placebo | 15612 | F | N | Y | N | NaN | NaN | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | NaN | C |
| 27 | 27 | 4467.0 | D-penicillamine | 12398 | F | N | Y | N | N | NaN | 414.0 | 3.43 | 41.0 | 876.0 | 84.00 | 110.0 | 385.0 | 11.0 | 3.0 | C |
Stage
nan_train[nan_train['Stage'].isnull()].head(5)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 999.0 | D-penicillamine | 21532 | M | N | N | N | N | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | NaN | D |
| 10 | 10 | 3581.0 | Placebo | 25772 | F | N | N | N | N | 0.5 | 252.0 | 3.60 | 26.0 | 377.0 | 56.76 | 185.0 | 336.0 | 10.0 | NaN | C |
| 17 | 17 | 1592.0 | D-penicillamine | 14872 | F | N | Y | N | N | 1.1 | 392.0 | 3.43 | 39.0 | 1395.0 | 184.45 | 133.0 | 328.0 | 11.2 | NaN | C |
| 20 | 20 | 3092.0 | Placebo | 15612 | F | N | Y | N | NaN | NaN | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | NaN | C |
| 23 | 23 | 1152.0 | D-penicillamine | 16736 | F | N | Y | N | N | 1.1 | 373.0 | 3.90 | 69.0 | 1353.0 | 116.25 | 139.0 | 268.0 | 10.0 | NaN | C |
Change categorical variables to numeric in order to implement imputers
#change string to values to integers
train_simputer['Drug'] = np.where(train_simputer['Drug']=='D-penicillamine', 1, 0)
train_simputer['Sex'] = np.where(train_simputer['Sex']=='F', 1, 0)
train_simputer['Ascites'] = np.where(train_simputer['Ascites']=='Y', 1, 0)
train_simputer['Hepatomegaly'] = np.where(train_simputer['Hepatomegaly']=='Y', 1, 0)
train_simputer['Spiders'] = np.where(train_simputer['Spiders']=='Y', 1, 0)
train_simputer['Edema'] = train_simputer['Edema'].replace({'N': 0, 'Y': 1, 'S': 2 })
train_simputer['Status'] = train_simputer['Status'].replace({'D': 0, 'C': 1, 'CL': 2 })
Call simple imputer, here we will replace with most frequent value
#call simple imputer:
simple_imputer = SimpleImputer(missing_values = np.nan,
strategy ='most_frequent')
imputer = simple_imputer.fit_transform(train_simputer)
df_simple_imputed = pd.DataFrame(imputer, columns=train_simputer.columns)
df_simple_imputed.head()
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 999.0 | 1.0 | 21532.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | 0.0 |
| 1 | 1.0 | 2574.0 | 0.0 | 19237.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | 1.0 |
| 2 | 2.0 | 3428.0 | 0.0 | 13727.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.3 | 299.0 | 3.55 | 131.0 | 1029.0 | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | 0.0 |
| 3 | 3.0 | 2576.0 | 0.0 | 18460.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.6 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | 1.0 |
| 4 | 4.0 | 788.0 | 0.0 | 16658.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.1 | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | 1.0 |
df_simple_imputed.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7905 entries, 0 to 7904 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7905 non-null float64 1 N_Days 7905 non-null float64 2 Drug 7905 non-null float64 3 Age 7905 non-null float64 4 Sex 7905 non-null float64 5 Ascites 7905 non-null float64 6 Hepatomegaly 7905 non-null float64 7 Spiders 7905 non-null float64 8 Edema 7905 non-null float64 9 Bilirubin 7905 non-null float64 10 Cholesterol 7905 non-null float64 11 Albumin 7905 non-null float64 12 Copper 7905 non-null float64 13 Alk_Phos 7905 non-null float64 14 SGOT 7905 non-null float64 15 Tryglicerides 7905 non-null float64 16 Platelets 7905 non-null float64 17 Prothrombin 7905 non-null float64 18 Stage 7905 non-null float64 19 Status 7905 non-null float64 dtypes: float64(20) memory usage: 1.2 MB
df_simple_imputed[df_simple_imputed['id'].isin([7, 25, 28, 33, 34])]
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 7.0 | 1615.0 | 0.0 | 21281.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.6 | 273.0 | 3.94 | 36.0 | 598.0 | 52.70 | 214.0 | 227.0 | 9.9 | 3.0 | 1.0 |
| 25 | 25.0 | 799.0 | 0.0 | 27220.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.3 | 325.0 | 3.60 | 81.0 | 2065.0 | 232.50 | 100.0 | 277.0 | 11.1 | 4.0 | 1.0 |
| 28 | 28.0 | 2301.0 | 1.0 | 15105.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.3 | 528.0 | 3.34 | 173.0 | 1282.0 | 120.90 | 55.0 | 123.0 | 10.7 | 4.0 | 0.0 |
| 33 | 33.0 | 3090.0 | 0.0 | 20254.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.9 | 373.0 | 3.67 | 281.0 | 754.0 | 130.20 | 89.0 | 277.0 | 11.6 | 2.0 | 0.0 |
| 34 | 34.0 | 850.0 | 0.0 | 23235.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.3 | 178.0 | 2.93 | 188.0 | 1020.0 | 119.35 | 178.0 | 136.0 | 11.5 | 4.0 | 0.0 |
For N_Days the most common value was 1216, all NaN rows were replaced with this value.
df_simple_imputed[df_simple_imputed['id'].isin([1, 3, 11, 13, 19])]
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1.0 | 2574.0 | 0.0 | 19237.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.9 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | 1.0 |
| 3 | 3.0 | 2576.0 | 0.0 | 18460.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.6 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | 1.0 |
| 11 | 11.0 | 1614.0 | 0.0 | 14106.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.9 | 328.0 | 3.61 | 62.0 | 1105.0 | 137.95 | 95.0 | 145.0 | 9.5 | 3.0 | 1.0 |
| 13 | 13.0 | 1153.0 | 1.0 | 22347.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.6 | 232.0 | 3.83 | 24.0 | 678.0 | 65.10 | 99.0 | 248.0 | 10.4 | 3.0 | 1.0 |
| 19 | 19.0 | 3358.0 | 0.0 | 17031.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.6 | 322.0 | 3.77 | 52.0 | 834.0 | 60.45 | 214.0 | 153.0 | 11.0 | 3.0 | 1.0 |
The most common value for edema was 0, or no edema.
df_simple_imputed[df_simple_imputed['id'].isin([3, 4, 15, 20, 27])]
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 3.0 | 2576.0 | 0.0 | 18460.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.6 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | 1.0 |
| 4 | 4.0 | 788.0 | 0.0 | 16658.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.1 | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | 1.0 |
| 15 | 15.0 | 1212.0 | 0.0 | 15112.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.7 | 335.0 | 3.54 | 44.0 | 1345.0 | 137.95 | 145.0 | 244.0 | 10.6 | 3.0 | 1.0 |
| 20 | 20.0 | 3092.0 | 0.0 | 15612.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.6 | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | 4.0 | 1.0 |
| 27 | 27.0 | 4467.0 | 1.0 | 12398.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.2 | 414.0 | 3.43 | 41.0 | 876.0 | 84.00 | 110.0 | 385.0 | 11.0 | 3.0 | 1.0 |
The most common value for bilirubin 0.6.
df_simple_imputed[df_simple_imputed['id'].isin([0, 10, 17, 20, 23])]
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 999.0 | 1.0 | 21532.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.3 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.0 | 0.0 |
| 10 | 10.0 | 3581.0 | 0.0 | 25772.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 252.0 | 3.60 | 26.0 | 377.0 | 56.76 | 185.0 | 336.0 | 10.0 | 2.0 | 1.0 |
| 17 | 17.0 | 1592.0 | 1.0 | 14872.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.1 | 392.0 | 3.43 | 39.0 | 1395.0 | 184.45 | 133.0 | 328.0 | 11.2 | 2.0 | 1.0 |
| 20 | 20.0 | 3092.0 | 0.0 | 15612.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.6 | 303.0 | 3.10 | 70.0 | 1032.0 | 56.76 | 154.0 | 336.0 | 10.6 | 4.0 | 1.0 |
| 23 | 23.0 | 1152.0 | 1.0 | 16736.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.1 | 373.0 | 3.90 | 69.0 | 1353.0 | 116.25 | 139.0 | 268.0 | 10.0 | 4.0 | 1.0 |
The most common occuring value for stage is stage 3.
df_simple_imputed.to_csv('train_simple_imputer.csv', index=False)
Make a copy of our original nan dataframe:
train_mice = nan_train.copy(deep=True)
train_mice['Drug'] = np.where(train_mice['Drug']=='D-penicillamine', 1, 0)
train_mice['Sex'] = np.where(train_mice['Sex']=='F', 1, 0)
train_mice['Ascites'] = np.where(train_mice['Ascites']=='Y', 1, 0)
train_mice['Hepatomegaly'] = np.where(train_mice['Hepatomegaly']=='Y', 1, 0)
train_mice['Spiders'] = np.where(train_mice['Spiders']=='Y', 1, 0)
train_mice['Edema'] = train_mice['Edema'].replace({'N': 0, 'Y': 1, 'S': 2 })
train_mice['Status'] = train_mice['Status'].replace({'D': 0, 'C': 1, 'CL': 2 })
Call MICE Imputer:
mice_imputer = IterativeImputer()
imputed = mice_imputer.fit_transform(train_mice)
df_mice_imputed = pd.DataFrame(imputed, columns=train_mice.columns)
df_mice_imputed.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7905 entries, 0 to 7904 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7905 non-null float64 1 N_Days 7905 non-null float64 2 Drug 7905 non-null float64 3 Age 7905 non-null float64 4 Sex 7905 non-null float64 5 Ascites 7905 non-null float64 6 Hepatomegaly 7905 non-null float64 7 Spiders 7905 non-null float64 8 Edema 7905 non-null float64 9 Bilirubin 7905 non-null float64 10 Cholesterol 7905 non-null float64 11 Albumin 7905 non-null float64 12 Copper 7905 non-null float64 13 Alk_Phos 7905 non-null float64 14 SGOT 7905 non-null float64 15 Tryglicerides 7905 non-null float64 16 Platelets 7905 non-null float64 17 Prothrombin 7905 non-null float64 18 Stage 7905 non-null float64 19 Status 7905 non-null float64 dtypes: float64(20) memory usage: 1.2 MB
df_mice_imputed.to_csv('train_mice_imputer.csv', index=False)
This method of imputation is based on the k-nearest neighbors approach, which uses a Euclidean distance metric to find values close the missing value. Based on the values that its nearest neighbors hold, the missing feature is imputed either by averaging equally across neighbors, or by weighting the distance to each neighbor.
Source: https://scikit-learn.org/stable/modules/impute.html#knnimpute
Make copy of dataframe with NaNs
train_knn = nan_train.copy(deep=True)
#change string to values to integers
train_knn['Drug'] = np.where(train_knn['Drug']=='D-penicillamine', 1, 0)
train_knn['Sex'] = np.where(train_knn['Sex']=='F', 1, 0)
train_knn['Ascites'] = np.where(train_knn['Ascites']=='Y', 1, 0)
train_knn['Hepatomegaly'] = np.where(train_knn['Hepatomegaly']=='Y', 1, 0)
train_knn['Spiders'] = np.where(train_knn['Spiders']=='Y', 1, 0)
train_knn['Edema'] = train_knn['Edema'].replace({'N': 0, 'Y': 1, 'S': 2 })
train_knn['Status'] = train_knn['Status'].replace({'D': 0, 'C': 1, 'CL': 2 })
#call KNNImputer
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
imputer = knn_imputer.fit_transform(train_knn)
df_knn_imputed = pd.DataFrame(imputer, columns=train_knn.columns)
Here we have set the number of neighbors to 5 and the weights to to be uniform, i.e. all points in the neighborhood are weighted equally.
df_knn_imputed.head(20)
| id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 999.0 | 1.0 | 21532.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.30 | 316.0 | 3.35 | 172.0 | 1601.0 | 179.80 | 63.0 | 394.0 | 9.7 | 3.2 | 0.0 |
| 1 | 1.0 | 2574.0 | 0.0 | 19237.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.4 | 0.90 | 364.0 | 3.54 | 63.0 | 1440.0 | 134.85 | 88.0 | 361.0 | 11.0 | 3.0 | 1.0 |
| 2 | 2.0 | 3428.0 | 0.0 | 13727.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.30 | 299.0 | 3.55 | 131.0 | 1029.0 | 119.35 | 50.0 | 199.0 | 11.7 | 4.0 | 0.0 |
| 3 | 3.0 | 2576.0 | 0.0 | 18460.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.4 | 1.30 | 256.0 | 3.50 | 58.0 | 1653.0 | 71.30 | 96.0 | 269.0 | 10.7 | 3.0 | 1.0 |
| 4 | 4.0 | 788.0 | 0.0 | 16658.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.02 | 346.0 | 3.65 | 63.0 | 1181.0 | 125.55 | 96.0 | 298.0 | 10.6 | 4.0 | 1.0 |
| 5 | 5.0 | 703.0 | 1.0 | 19270.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.60 | 227.0 | 3.46 | 34.0 | 6456.2 | 60.63 | 68.0 | 213.0 | 11.5 | 3.0 | 0.0 |
| 6 | 6.0 | 1300.0 | 0.0 | 17703.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.00 | 328.0 | 3.35 | 43.0 | 1677.0 | 137.95 | 90.0 | 291.0 | 9.8 | 3.0 | 1.0 |
| 7 | 7.0 | 1787.0 | 0.0 | 21281.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.60 | 273.0 | 3.94 | 36.0 | 598.0 | 52.70 | 214.0 | 227.0 | 9.9 | 3.0 | 1.0 |
| 8 | 8.0 | 2050.0 | 1.0 | 20684.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.70 | 360.0 | 3.65 | 72.0 | 3196.0 | 94.55 | 154.0 | 269.0 | 9.8 | 2.0 | 1.0 |
| 9 | 9.0 | 2615.0 | 1.0 | 15009.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.90 | 478.0 | 3.60 | 39.0 | 1758.0 | 171.00 | 140.0 | 234.0 | 10.6 | 2.0 | 1.0 |
| 10 | 10.0 | 3581.0 | 0.0 | 25772.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.50 | 252.0 | 3.60 | 26.0 | 377.0 | 56.76 | 185.0 | 336.0 | 10.0 | 3.4 | 1.0 |
| 11 | 11.0 | 1614.0 | 0.0 | 14106.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.4 | 0.90 | 328.0 | 3.61 | 62.0 | 1105.0 | 137.95 | 95.0 | 145.0 | 9.5 | 3.0 | 1.0 |
| 12 | 12.0 | 1847.0 | 0.0 | 12279.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.60 | 232.0 | 3.68 | 38.0 | 1029.0 | 128.65 | 99.0 | 273.0 | 10.7 | 2.0 | 1.0 |
| 13 | 13.0 | 1153.0 | 1.0 | 22347.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.60 | 232.0 | 3.83 | 24.0 | 678.0 | 65.10 | 99.0 | 248.0 | 10.4 | 3.0 | 1.0 |
| 14 | 14.0 | 904.0 | 1.0 | 22388.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.90 | 304.0 | 3.20 | 13.0 | 1440.0 | 153.45 | 169.0 | 156.0 | 10.0 | 3.0 | 0.0 |
| 15 | 15.0 | 1212.0 | 0.0 | 15112.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.88 | 335.0 | 3.54 | 44.0 | 1345.0 | 137.95 | 145.0 | 244.0 | 10.6 | 3.0 | 1.0 |
| 16 | 16.0 | 1967.0 | 0.0 | 17884.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.70 | 328.0 | 3.58 | 39.0 | 1065.0 | 98.00 | 78.0 | 259.0 | 11.7 | 2.0 | 1.0 |
| 17 | 17.0 | 1592.0 | 1.0 | 14872.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.10 | 392.0 | 3.43 | 39.0 | 1395.0 | 184.45 | 133.0 | 328.0 | 11.2 | 2.4 | 1.0 |
| 18 | 18.0 | 1481.0 | 0.0 | 18302.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.00 | 259.0 | 3.85 | 67.0 | 936.0 | 134.85 | 139.0 | 341.0 | 9.6 | 3.0 | 1.0 |
| 19 | 19.0 | 3358.0 | 0.0 | 17031.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.60 | 322.0 | 3.77 | 52.0 | 834.0 | 60.45 | 214.0 | 153.0 | 11.0 | 3.0 | 1.0 |
df_knn_imputed.to_csv('train_knn_imputer.csv', index=False)
#load imputed datasets
df_simple_imputed = pd.read_csv ('train_simple_imputer.csv')
df_mice_imputed = pd.read_csv('train_mice_imputer.csv')
df_knn_imputed = pd.read_csv('train_knn_imputer.csv')
print("Simple Imputed shape:",df_simple_imputed.shape)
print("MICE shape:",df_mice_imputed.shape)
print("KNN Imputed shape:",df_knn_imputed.shape)
Simple Imputed shape: (7905, 20) MICE shape: (7905, 20) KNN Imputed shape: (7905, 20)
First, pass all datasets through pipeline, then save each x and y with their respective imputation methods (example X_simple_imputer, y_simple_imputer) and then use each to train RF model.
X_tmp.head(10)
| N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 999.0 | 1.0 | 21532.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.832909 | 5.755742 | 3.35 | 5.147494 | 7.378384 | 5.191845 | 4.143135 | 394.0 | 2.272126 | 3.2 |
| 1 | 2574.0 | 0.0 | 19237.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.4 | -0.105361 | 5.897154 | 3.54 | 4.143135 | 7.272398 | 4.904163 | 4.477337 | 361.0 | 2.397895 | 3.0 |
| 2 | 3428.0 | 0.0 | 13727.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.193922 | 5.700444 | 3.55 | 4.875197 | 6.936343 | 4.782060 | 3.912023 | 199.0 | 2.459589 | 4.0 |
| 3 | 2576.0 | 0.0 | 18460.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.4 | 0.262364 | 5.545177 | 3.50 | 4.060443 | 7.410347 | 4.266896 | 4.564348 | 269.0 | 2.370244 | 3.0 |
| 4 | 788.0 | 0.0 | 16658.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.703098 | 5.846439 | 3.65 | 4.143135 | 7.074117 | 4.832704 | 4.564348 | 298.0 | 2.360854 | 4.0 |
| 5 | 703.0 | 1.0 | 19270.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | -0.510826 | 5.424950 | 3.46 | 3.526361 | 8.772796 | 4.104790 | 4.219508 | 213.0 | 2.442347 | 3.0 |
| 6 | 1300.0 | 0.0 | 17703.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 5.793014 | 3.35 | 3.761200 | 7.424762 | 4.926891 | 4.499810 | 291.0 | 2.282382 | 3.0 |
| 7 | 1787.0 | 0.0 | 21281.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | -0.510826 | 5.609472 | 3.94 | 3.583519 | 6.393591 | 3.964615 | 5.365976 | 227.0 | 2.292535 | 3.0 |
| 8 | 2050.0 | 1.0 | 20684.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.356675 | 5.886104 | 3.65 | 4.276666 | 8.069655 | 4.549129 | 5.036953 | 269.0 | 2.282382 | 2.0 |
| 9 | 2615.0 | 1.0 | 15009.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.105361 | 6.169611 | 3.60 | 3.663562 | 7.471932 | 5.141664 | 4.941642 | 234.0 | 2.360854 | 2.0 |
X_tmp['Stage'] = X_tmp['Stage'].round(0)
# Preprocess response data
X_ready = pd.DataFrame(full_input_preprocessor.fit_transform(X_tmp),
columns=full_input_preprocessor.get_feature_names_out())
X = X_ready
y_tmp.replace({1.0: 'C', 0.0: 'D', 2.0: 'CL'}, inplace=True)
y_tmp.replace({'C':0, 'CL': 1.0, 'D':2.0}, inplace=True)
# Preprocess response data
y_ready = full_response_preprocessor.fit_transform(y_tmp)
y = y_ready
X.head(10)
| num__SGOT | num__Platelets | num__Prothrombin | num__Tryglicerides | num__N_Days | num__Alk_Phos | num__Cholesterol | num__Copper | num__Bilirubin | num__Albumin | ... | ord__Sex | ord__Ascites | ord__Hepatomegaly | ord__Spiders | ord__Edema | ord__Drug | nom__Stage_1.0 | nom__Stage_2.0 | nom__Stage_3.0 | nom__Stage_4.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.310474 | 1.474692 | -1.278946 | -1.332898 | -1.012680 | 0.244603 | -0.026949 | 1.315397 | 0.460876 | -0.574110 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 0.604235 | 1.097235 | 0.544922 | -0.478656 | 0.540773 | 0.089660 | 0.336825 | 0.022860 | -0.573725 | -0.024807 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.304481 | -0.755736 | 1.439583 | -1.923634 | 1.383090 | -0.401633 | -0.169201 | 0.964971 | 0.858954 | 0.004104 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | -0.960211 | 0.044930 | 0.143928 | -0.256250 | 0.542746 | 0.291332 | -0.568615 | -0.083558 | -0.168247 | -0.140450 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 0.428808 | 0.376635 | 0.007761 | -0.256250 | -1.220793 | -0.200216 | 0.206363 | 0.022860 | 0.317736 | 0.293211 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 5 | -1.358171 | -0.595603 | 1.189548 | -1.137684 | -1.304630 | 2.283148 | -0.877894 | -0.770883 | -1.020819 | -0.256092 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 6 | 0.660031 | 0.296568 | -1.130210 | -0.421214 | -0.715798 | 0.312405 | 0.068930 | -0.468662 | -0.457548 | -0.574110 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 7 | -1.702289 | -0.435469 | -0.982983 | 1.792761 | -0.235460 | -1.195102 | -0.403222 | -0.697324 | -1.020819 | 1.131621 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 8 | -0.267350 | 0.044930 | -1.130210 | 0.951756 | 0.023942 | 1.255199 | 0.308400 | 0.194705 | -0.850842 | 0.293211 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 9 | 1.187282 | -0.355403 | 0.007761 | 0.708138 | 0.581212 | 0.381365 | 1.037706 | -0.594315 | -0.573725 | 0.148657 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
10 rows × 21 columns
Saving the data from the simple imputer dataframe and copying it, to then input the mice and knn imputer dataframes and save them separately (so as to not copy the pipeline again)
X_simple_imputer = X.copy()
y_simple_imputer = y.copy()
X_mice_imputer = X.copy()
y_mice_imputer = y.copy()
X_knn_imputer = X.copy()
y_knn_imputer = y.copy()
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
Pass simple imputation data
y_pred_proba = cross_val_predict(rm_classifier_tuned, X_simple_imputer, y_simple_imputer, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'Simple Imputer log loss: {log_loss(y, y_pred_proba)}')
print(f'Simple Imputer Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Simple Imputer Null Accuracy: {null_accuracy}')
Simple Imputer log loss: 0.4941540755674212 Simple Imputer Validation accuracy: 0.8084917617237009 Simple Imputer Null Accuracy: 0.0 0.628644 2.0 0.336502 1.0 0.034854 Name: Status, dtype: float64
classif_report = classification_report(y, y_pred)
print (classif_report)
precision recall f1-score support
0 0.82 0.91 0.87 4960
1 0.00 0.00 0.00 275
2 0.77 0.69 0.73 2655
accuracy 0.81 7890
macro avg 0.53 0.54 0.53 7890
weighted avg 0.78 0.81 0.79 7890
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Random Forest: Simple Imputer')
normalised_confusion_matrix_plot(conf_matrix,'Random Forest: Simple Imputer')
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
y_pred_proba = cross_val_predict(rm_classifier_tuned, X_mice_imputer, y_mice_imputer, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'MICE log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
MICE log loss: 0.4941540755674212 Validation accuracy: 0.8084917617237009 Null Accuracy: 0.0 0.628644 2.0 0.336502 1.0 0.034854 Name: Status, dtype: float64
classif_report = classification_report(y, y_pred)
print (classif_report)
precision recall f1-score support
0 0.82 0.91 0.87 4960
1 0.00 0.00 0.00 275
2 0.77 0.69 0.73 2655
accuracy 0.81 7890
macro avg 0.53 0.54 0.53 7890
weighted avg 0.78 0.81 0.79 7890
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Random Forest: MICE Imputer')
normalised_confusion_matrix_plot(conf_matrix,'Random Forest: MICE Imputer')
rm_classifier_tuned = RandomForestClassifier(max_depth=15, min_samples_leaf=10, min_samples_split=30, n_estimators=100, random_state=79)
y_pred_proba = cross_val_predict(rm_classifier_tuned, X_knn_imputer, y_knn_imputer, cv=5, method='predict_proba')
y_pred = np.argmax(y_pred_proba, axis=1)
null_accuracy = y_tmp.value_counts(normalize=True)
print(f'KNN Imputer log loss: {log_loss(y, y_pred_proba)}')
print(f'Validation accuracy: {accuracy_score(y, y_pred)}')
print(f'Null Accuracy: {null_accuracy}')
KNN Imputer log loss: 0.4941540755674212 Validation accuracy: 0.8084917617237009 Null Accuracy: 0.0 0.628644 2.0 0.336502 1.0 0.034854 Name: Status, dtype: float64
classif_report = classification_report(y, y_pred)
print (classif_report)
precision recall f1-score support
0 0.82 0.91 0.87 4960
1 0.00 0.00 0.00 275
2 0.77 0.69 0.73 2655
accuracy 0.81 7890
macro avg 0.53 0.54 0.53 7890
weighted avg 0.78 0.81 0.79 7890
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
conf_matrix = confusion_matrix(y,y_pred)
confusion_matrix_plot(conf_matrix,'Random Forest: KNN Imputer')
normalised_confusion_matrix_plot(conf_matrix,'Random Forest: KNN Imputer')
The log losses for the simple, mice and knn imputers were 0.49, 0.51, and 0.51, respectively. Overall, not a significant difference from the original dataset, although slightly improved with the simple imputation method, which could be explained by the fact that the simple imputation method uses the most common value for each feature, so it is more likely that a predicted label will be similar to others in the dataset, essentially it is predicting over what is most likely to be expected.